diff --git a/.gitattributes b/.gitattributes index d99b3c08b92e24ac3904bc17c63fc1cde065ed76..1b2c23ceb453456cfa116dbe05f1a1282defb846 100644 --- a/.gitattributes +++ b/.gitattributes @@ -62,3 +62,4 @@ sft/smoe_refinev3_665k_llava/logs/0418_1721_llava..._pope_llava_model_args_9259d sft/smoe_refinev3_665k_llava/logs/0418_1721_llava..._pope_llava_model_args_9259d6/pope.json filter=lfs diff=lfs merge=lfs -text sft/smoe_refinev3_665k_llava/logs/0418_1721_llava..._pope_llava_model_args_9259d6/textvqa_val.json filter=lfs diff=lfs merge=lfs -text sft/smoe_refinev3_665k_llava/logs/0418_1748_llava_v1.5_gqa_llava_model_args_9259d6/gqa.json filter=lfs diff=lfs merge=lfs -text +sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/textvqa_val.json filter=lfs diff=lfs merge=lfs -text diff --git a/sft_pretrain/Full_competesmoev30/added_tokens.json b/sft_pretrain/Full_competesmoev30/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/config.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28a5bb1c149304f33214eee3c6e2764711ffb065 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/generation_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c07d3d2bbd1cfebed61c81ad3bfc51cf80c8f56e --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea4479e66aa85e8466bf3a3d1a2f9804e324234dc0fbe46e75c67af513bdd322 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c452f65676864ca30d4eeae5a5d9ce18e81cf64 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2088221269ad2d364b787b2ab4c9f417764e40f70d80aa906317f07b22b095 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5dbe9bce43c2b6d054b352b23330c1356ec88ec --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e32c502a7d4d022bffd51c4140ee5e5b62b4ab8c731ea99acfaad3e0ccbbf8f3 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..971d20dc136a6ea4413e8e418a9a72edae20d6de --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c645173acf11358d252cacac5a5b11ddf191cd554c34f49bf5604b8f3f36c66 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5b38d1e0355f7ef070265322f4e0a46d960c657 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c93622ab52b061ca095aa119b8887ae8f84b4e040a6e2770b6e88b7a0c25db +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c46520b39414fc8fb28d9f9d5c4109c709df3d6 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f41b80f22c1bc26bc4c806d64bc3e3fcfd6a9860faf1434383fb3431b5e565e +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00586df1cfebcb0743f8a86a9476e192bd9f9fbe --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71b40864800a684d109e812a97b931c301d2dfbb30b9a9f6ad9d068a0dfee0f1 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6df779d94a32829e77a5255e51504f34d77a303 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e117c267bf55acdabe24d2a092e32796f990bbddd19770be46430c72d21af4d +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/latest b/sft_pretrain/Full_competesmoev30/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank0_metric_eval_done.txt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank0_metric_eval_done.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9c064df42468d805177a80623c54c976c8d760e --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank0_metric_eval_done.txt @@ -0,0 +1 @@ +rank 0 eval done \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank1_metric_eval_done.txt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank1_metric_eval_done.txt new file mode 100644 index 0000000000000000000000000000000000000000..36792c9cedb6c006db3a866d72eac15f0ce6a64a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank1_metric_eval_done.txt @@ -0,0 +1 @@ +rank 1 eval done \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank2_metric_eval_done.txt b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank2_metric_eval_done.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3e5c7ecd1fd051ff210a79f69ad980d587fd5b3 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/rank2_metric_eval_done.txt @@ -0,0 +1 @@ +rank 2 eval done \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/results.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/results.json new file mode 100644 index 0000000000000000000000000000000000000000..a0d359997541cd6a41b90d38635c3c2baf4856c3 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/results.json @@ -0,0 +1,74 @@ +{ + "results": { + "textvqa_val": { + "exact_match,none": 0.32172000027894976, + "exact_match_stderr,none": 0.0064028936933341555, + "submission,none": null, + "submission_stderr,none": "N/A", + "alias": "textvqa_val" + } + }, + "configs": { + "textvqa_val": { + "task": "textvqa_val", + "dataset_path": "lmms-lab/textvqa", + "test_split": "validation", + "doc_to_visual": "", + "doc_to_text": "", + "doc_to_target": "answer", + "process_results": "", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": true + }, + { + "metric": "submission", + "aggregation": "", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "ASSISTANT:" + ] + }, + "repeats": 1, + "should_decontaminate": false, + "model_specific_prompt_kwargs": { + "default": { + "pre_prompt": "", + "post_prompt": "\nAnswer the question using a single word or phrase.", + "ocr": false + }, + "qwen_vl": { + "pre_prompt": "", + "post_prompt": " Answer:" + } + } + } + }, + "versions": { + "textvqa_val": "Yaml" + }, + "n-shot": { + "textvqa_val": 0 + }, + "model_configs": { + "model": "llava", + "model_args": "pretrained=/cm/archive/namnv78/checkpoints/Xphi35-siglip224/pretrain_moe/Full_competesmoev30/checkpoint-1040,conv_template=phi35", + "batch_size": "1", + "device": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": "" + }, + "git_hash": "c669d52" +} \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/submissions/textvqa_submission_2025-05-12-01-45-45.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/submissions/textvqa_submission_2025-05-12-01-45-45.json new file mode 100644 index 0000000000000000000000000000000000000000..782b7613e4af96bf96ec36e3517838b93dabb117 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/submissions/textvqa_submission_2025-05-12-01-45-45.json @@ -0,0 +1 @@ +[{"question_id": 34602, "answer": "dakota"}, {"question_id": 34605, "answer": "ormr"}, {"question_id": 34608, "answer": "10:00"}, {"question_id": 34611, "answer": "phillip harper"}, {"question_id": 34614, "answer": "pepsi"}, {"question_id": 34617, "answer": "red"}, {"question_id": 34620, "answer": "3"}, {"question_id": 34623, "answer": "2012"}, {"question_id": 34626, "answer": "no"}, {"question_id": 34629, "answer": "race"}, {"question_id": 34632, "answer": "gabriela belli"}, {"question_id": 34635, "answer": "10:07"}, {"question_id": 34638, "answer": "10"}, {"question_id": 34641, "answer": "10"}, {"question_id": 34644, "answer": "50"}, {"question_id": 34647, "answer": "rj"}, {"question_id": 34650, "answer": "french"}, {"question_id": 34653, "answer": "1000"}, {"question_id": 34656, "answer": "nets"}, {"question_id": 34659, "answer": "english"}, {"question_id": 34662, "answer": "beer"}, {"question_id": 34665, "answer": "19"}, {"question_id": 34668, "answer": "10 and 11"}, {"question_id": 34671, "answer": "ray kurzweil"}, {"question_id": 34674, "answer": "ernst"}, {"question_id": 34677, "answer": "2012"}, {"question_id": 34680, "answer": "0.0%"}, {"question_id": 34683, "answer": "safeway"}, {"question_id": 34686, "answer": "pizza"}, {"question_id": 34689, "answer": "coca cola"}, {"question_id": 34692, "answer": "12:00"}, {"question_id": 34695, "answer": "winnipeg dental"}, {"question_id": 34698, "answer": "lady is vampire"}, {"question_id": 34701, "answer": "clock"}, {"question_id": 34704, "answer": "performers"}, {"question_id": 34707, "answer": "sweet"}, {"question_id": 34710, "answer": "just mobile gum plus"}, {"question_id": 34713, "answer": "clason"}, {"question_id": 34716, "answer": "ren\u00e9 camus"}, {"question_id": 34719, "answer": "i love you"}, {"question_id": 34722, "answer": "mug"}, {"question_id": 34725, "answer": "0"}, {"question_id": 34728, "answer": "all day"}, {"question_id": 34731, "answer": "hertz"}, {"question_id": 34734, "answer": "london underground"}, {"question_id": 34737, "answer": "yammer"}, {"question_id": 34740, "answer": "pale ale"}, {"question_id": 34743, "answer": "halloween"}, {"question_id": 34746, "answer": "tpc"}, {"question_id": 34749, "answer": "do you want to study in usa"}, {"question_id": 34752, "answer": "ryder"}, {"question_id": 34755, "answer": "strand for stand"}, {"question_id": 34758, "answer": "kara elet"}, {"question_id": 34761, "answer": "post office"}, {"question_id": 34764, "answer": "umd"}, {"question_id": 34767, "answer": "6"}, {"question_id": 34770, "answer": "samsung"}, {"question_id": 34773, "answer": "california"}, {"question_id": 34776, "answer": "lancome"}, {"question_id": 34779, "answer": "pepsi"}, {"question_id": 34782, "answer": "3900"}, {"question_id": 34785, "answer": "t"}, {"question_id": 34788, "answer": "yes"}, {"question_id": 34791, "answer": "witcher"}, {"question_id": 34794, "answer": "purdue"}, {"question_id": 34797, "answer": "w"}, {"question_id": 34800, "answer": "beer"}, {"question_id": 34803, "answer": "jr highway bus"}, {"question_id": 34806, "answer": "10:10"}, {"question_id": 34809, "answer": "lakodalma"}, {"question_id": 34812, "answer": "samsung"}, {"question_id": 34815, "answer": "germany"}, {"question_id": 34818, "answer": "airplane"}, {"question_id": 34821, "answer": "lg"}, {"question_id": 34824, "answer": "liz claire"}, {"question_id": 34827, "answer": "nog 3 km"}, {"question_id": 34830, "answer": "burnaby"}, {"question_id": 34833, "answer": "seals"}, {"question_id": 34836, "answer": "yes"}, {"question_id": 34839, "answer": "vodka"}, {"question_id": 34842, "answer": "2"}, {"question_id": 34845, "answer": "northern"}, {"question_id": 34848, "answer": "ironman"}, {"question_id": 34851, "answer": "new york"}, {"question_id": 34854, "answer": "birds"}, {"question_id": 34857, "answer": "tedcom"}, {"question_id": 34860, "answer": "coca cola"}, {"question_id": 34863, "answer": "yes"}, {"question_id": 34866, "answer": "samsung"}, {"question_id": 34869, "answer": "funk"}, {"question_id": 34872, "answer": "kimberly kay hough"}, {"question_id": 34875, "answer": "10 inches"}, {"question_id": 34878, "answer": "blonde"}, {"question_id": 34881, "answer": "office"}, {"question_id": 34884, "answer": "coins"}, {"question_id": 34887, "answer": "boston"}, {"question_id": 34890, "answer": "c"}, {"question_id": 34893, "answer": "stop"}, {"question_id": 34896, "answer": "yes"}, {"question_id": 34899, "answer": "natan"}, {"question_id": 34902, "answer": "george"}, {"question_id": 34905, "answer": "no right turn"}, {"question_id": 34908, "answer": "o"}, {"question_id": 34911, "answer": "2012"}, {"question_id": 34914, "answer": "dell"}, {"question_id": 34917, "answer": "london bus"}, {"question_id": 34920, "answer": "tourist"}, {"question_id": 34923, "answer": "ws reymont komedia"}, {"question_id": 34926, "answer": "70"}, {"question_id": 34929, "answer": "900"}, {"question_id": 34932, "answer": "2"}, {"question_id": 34935, "answer": "page 1"}, {"question_id": 34938, "answer": "cappeewas"}, {"question_id": 34941, "answer": "2013"}, {"question_id": 34944, "answer": "mets"}, {"question_id": 34947, "answer": "nothing"}, {"question_id": 34950, "answer": "samsung"}, {"question_id": 34953, "answer": "post"}, {"question_id": 34956, "answer": "5"}, {"question_id": 34959, "answer": "main"}, {"question_id": 34962, "answer": "microsoft"}, {"question_id": 34965, "answer": "no"}, {"question_id": 34968, "answer": "adidas"}, {"question_id": 34971, "answer": "yes"}, {"question_id": 34974, "answer": "htc"}, {"question_id": 34977, "answer": "black"}, {"question_id": 34980, "answer": "milt thomas"}, {"question_id": 34983, "answer": "june"}, {"question_id": 34986, "answer": "nalli 's"}, {"question_id": 34989, "answer": "yes"}, {"question_id": 34992, "answer": "1"}, {"question_id": 34995, "answer": "right"}, {"question_id": 34998, "answer": "no"}, {"question_id": 35001, "answer": "macys"}, {"question_id": 35004, "answer": "red bull"}, {"question_id": 35007, "answer": "1"}, {"question_id": 35010, "answer": "muffins"}, {"question_id": 35013, "answer": "everyone"}, {"question_id": 35016, "answer": "toshiba"}, {"question_id": 35019, "answer": "new"}, {"question_id": 35022, "answer": "boston"}, {"question_id": 35025, "answer": "wood"}, {"question_id": 35028, "answer": "70"}, {"question_id": 35031, "answer": "mission"}, {"question_id": 35034, "answer": "taco bell"}, {"question_id": 35037, "answer": "bus"}, {"question_id": 35040, "answer": "danger deep water"}, {"question_id": 35043, "answer": "wine website"}, {"question_id": 35046, "answer": "1794"}, {"question_id": 35049, "answer": "yes"}, {"question_id": 35052, "answer": "samsung"}, {"question_id": 35055, "answer": "ok"}, {"question_id": 35058, "answer": "casual"}, {"question_id": 35061, "answer": "deep space diner"}, {"question_id": 35064, "answer": "north shore watch and other poems"}, {"question_id": 35067, "answer": "pizza"}, {"question_id": 35070, "answer": "media cafe"}, {"question_id": 35073, "answer": "cent"}, {"question_id": 35076, "answer": "big 's"}, {"question_id": 35079, "answer": "5"}, {"question_id": 35082, "answer": "start"}, {"question_id": 35085, "answer": "1 inch"}, {"question_id": 35088, "answer": "r"}, {"question_id": 35091, "answer": "htc"}, {"question_id": 35094, "answer": "toshiba"}, {"question_id": 35097, "answer": "coco"}, {"question_id": 35100, "answer": "qantas"}, {"question_id": 35103, "answer": "gil 's club"}, {"question_id": 35106, "answer": "open book: crowdsourced publication of global movement for open knowledge"}, {"question_id": 35109, "answer": "hp 2414"}, {"question_id": 35112, "answer": "2010"}, {"question_id": 35115, "answer": "rambo"}, {"question_id": 35118, "answer": "yes"}, {"question_id": 35121, "answer": "2012"}, {"question_id": 35124, "answer": "lg"}, {"question_id": 35127, "answer": "long star"}, {"question_id": 35130, "answer": "patties"}, {"question_id": 35133, "answer": "no"}, {"question_id": 35136, "answer": "tsingtao"}, {"question_id": 35139, "answer": "yes"}, {"question_id": 35142, "answer": "dvd"}, {"question_id": 35145, "answer": "bud"}, {"question_id": 35148, "answer": "2 3 4"}, {"question_id": 35151, "answer": "petra"}, {"question_id": 35154, "answer": "yes"}, {"question_id": 35157, "answer": "ikea"}, {"question_id": 35160, "answer": "shoes"}, {"question_id": 35163, "answer": "brenton 's"}, {"question_id": 35166, "answer": "tab key"}, {"question_id": 35169, "answer": "stephen 's marking ink"}, {"question_id": 35172, "answer": "new york"}, {"question_id": 35175, "answer": "givry"}, {"question_id": 35178, "answer": "hahn products distribution"}, {"question_id": 35181, "answer": "green"}, {"question_id": 35184, "answer": "dell"}, {"question_id": 35187, "answer": "stop"}, {"question_id": 35190, "answer": "2"}, {"question_id": 35193, "answer": "graffiti"}, {"question_id": 35196, "answer": "post office"}, {"question_id": 35199, "answer": "france"}, {"question_id": 35202, "answer": "12"}, {"question_id": 35205, "answer": "karazy"}, {"question_id": 35208, "answer": "whiskey"}, {"question_id": 35211, "answer": "megan gardiner"}, {"question_id": 35214, "answer": "samsung"}, {"question_id": 35217, "answer": "texas"}, {"question_id": 35220, "answer": "chino"}, {"question_id": 35223, "answer": "50"}, {"question_id": 35226, "answer": "10 10 2014"}, {"question_id": 35229, "answer": "canon"}, {"question_id": 35232, "answer": "yes"}, {"question_id": 35235, "answer": "happy"}, {"question_id": 35238, "answer": "kirin"}, {"question_id": 35241, "answer": "south"}, {"question_id": 35244, "answer": ""}, {"question_id": 35247, "answer": "2011 11 11"}, {"question_id": 35250, "answer": "10"}, {"question_id": 35253, "answer": "arizona"}, {"question_id": 35256, "answer": "5"}, {"question_id": 35259, "answer": "met"}, {"question_id": 35262, "answer": "wii"}, {"question_id": 35265, "answer": "wits"}, {"question_id": 35268, "answer": "new york"}, {"question_id": 35271, "answer": "ships"}, {"question_id": 35274, "answer": "100"}, {"question_id": 35277, "answer": "10"}, {"question_id": 35280, "answer": "$1.99"}, {"question_id": 35283, "answer": "headlights"}, {"question_id": 35286, "answer": "yes"}, {"question_id": 35289, "answer": "crosswalk"}, {"question_id": 35292, "answer": "18"}, {"question_id": 35295, "answer": "zod"}, {"question_id": 35298, "answer": "00"}, {"question_id": 35301, "answer": "black"}, {"question_id": 35304, "answer": "twisted pine"}, {"question_id": 35307, "answer": "paper mate"}, {"question_id": 35310, "answer": "nike"}, {"question_id": 35313, "answer": "cola"}, {"question_id": 35316, "answer": "1999"}, {"question_id": 35319, "answer": "clouds"}, {"question_id": 35322, "answer": "skol"}, {"question_id": 35325, "answer": "guinness"}, {"question_id": 35328, "answer": "edible arrangements"}, {"question_id": 35331, "answer": "ay"}, {"question_id": 35334, "answer": "love"}, {"question_id": 35337, "answer": "united states"}, {"question_id": 35340, "answer": "website test"}, {"question_id": 35343, "answer": "3"}, {"question_id": 35346, "answer": "1999"}, {"question_id": 35349, "answer": "10"}, {"question_id": 35352, "answer": "priests"}, {"question_id": 35355, "answer": "rigger"}, {"question_id": 35358, "answer": "yes"}, {"question_id": 35361, "answer": "1000"}, {"question_id": 35364, "answer": "yes"}, {"question_id": 35367, "answer": "2"}, {"question_id": 35370, "answer": "350"}, {"question_id": 35373, "answer": "yes"}, {"question_id": 35376, "answer": "beacon"}, {"question_id": 35379, "answer": "1"}, {"question_id": 35382, "answer": "30"}, {"question_id": 35385, "answer": "32"}, {"question_id": 35388, "answer": "k c dennison"}, {"question_id": 35391, "answer": "bauer"}, {"question_id": 35394, "answer": "dreyer"}, {"question_id": 35397, "answer": "cemetery"}, {"question_id": 35400, "answer": "subway stories"}, {"question_id": 35403, "answer": "jimmy carter"}, {"question_id": 35406, "answer": "floris violet"}, {"question_id": 35409, "answer": "daily mirror"}, {"question_id": 35412, "answer": "1969"}, {"question_id": 35415, "answer": "sportsnet"}, {"question_id": 35418, "answer": "44"}, {"question_id": 35421, "answer": "wine"}, {"question_id": 35424, "answer": "gemini"}, {"question_id": 35427, "answer": "storbror"}, {"question_id": 35430, "answer": "city"}, {"question_id": 35433, "answer": "7"}, {"question_id": 35436, "answer": "bonds"}, {"question_id": 35439, "answer": "light"}, {"question_id": 35442, "answer": "dragonflight dragonflight"}, {"question_id": 35445, "answer": "wednesday"}, {"question_id": 35448, "answer": "future"}, {"question_id": 35451, "answer": "toronto"}, {"question_id": 35454, "answer": "rc"}, {"question_id": 35457, "answer": "$10"}, {"question_id": 35460, "answer": "michigan"}, {"question_id": 35463, "answer": "books"}, {"question_id": 35466, "answer": "75"}, {"question_id": 35469, "answer": "art magazines"}, {"question_id": 35472, "answer": "identity matrix"}, {"question_id": 35475, "answer": "dover books"}, {"question_id": 35478, "answer": "google"}, {"question_id": 35481, "answer": "sony"}, {"question_id": 35484, "answer": "pokemon"}, {"question_id": 35487, "answer": "100"}, {"question_id": 35490, "answer": "toshiba"}, {"question_id": 35493, "answer": "pythagorean"}, {"question_id": 35496, "answer": "12:33"}, {"question_id": 35499, "answer": "5"}, {"question_id": 35502, "answer": "jct 617"}, {"question_id": 35505, "answer": "3"}, {"question_id": 35508, "answer": "40"}, {"question_id": 35511, "answer": "123456789"}, {"question_id": 35514, "answer": "glenfiddich"}, {"question_id": 35517, "answer": "young adult"}, {"question_id": 35520, "answer": "yes"}, {"question_id": 35523, "answer": "lm"}, {"question_id": 35526, "answer": "872131"}, {"question_id": 35529, "answer": "gonzales"}, {"question_id": 35532, "answer": "2000"}, {"question_id": 35535, "answer": "rolex"}, {"question_id": 35538, "answer": "rolex"}, {"question_id": 35541, "answer": "microsoft"}, {"question_id": 35544, "answer": "ramirez"}, {"question_id": 35547, "answer": "beer"}, {"question_id": 35550, "answer": "ipad"}, {"question_id": 35553, "answer": "elizabeth moon"}, {"question_id": 35556, "answer": "what is middle question"}, {"question_id": 35559, "answer": "88"}, {"question_id": 35562, "answer": "ice"}, {"question_id": 35565, "answer": "global pala alcohol"}, {"question_id": 35568, "answer": "macdonald"}, {"question_id": 35571, "answer": "player"}, {"question_id": 35574, "answer": "doctor"}, {"question_id": 35577, "answer": "yes"}, {"question_id": 35580, "answer": "ella"}, {"question_id": 35583, "answer": "nate bowman"}, {"question_id": 35586, "answer": "kolsch"}, {"question_id": 35589, "answer": "yes"}, {"question_id": 35592, "answer": "calories"}, {"question_id": 35595, "answer": "yes"}, {"question_id": 35598, "answer": "black and decker"}, {"question_id": 35601, "answer": "arena"}, {"question_id": 35604, "answer": "welcome to washington dc"}, {"question_id": 35607, "answer": "i love ny"}, {"question_id": 35610, "answer": "spanish"}, {"question_id": 35613, "answer": "plymouth"}, {"question_id": 35616, "answer": "motivation"}, {"question_id": 35619, "answer": "hudson"}, {"question_id": 35622, "answer": "k"}, {"question_id": 35625, "answer": "petros"}, {"question_id": 35628, "answer": "1"}, {"question_id": 35631, "answer": "clothes"}, {"question_id": 35634, "answer": "riccardo"}, {"question_id": 35637, "answer": "flying girl"}, {"question_id": 35640, "answer": "t"}, {"question_id": 35643, "answer": "tui"}, {"question_id": 35646, "answer": "networker 's non negotiable"}, {"question_id": 35649, "answer": "lg"}, {"question_id": 35652, "answer": "ruler"}, {"question_id": 35655, "answer": "hi"}, {"question_id": 35658, "answer": "sarah"}, {"question_id": 35661, "answer": "tf 100"}, {"question_id": 35664, "answer": "sprint"}, {"question_id": 35667, "answer": "10"}, {"question_id": 35670, "answer": "dominion"}, {"question_id": 35673, "answer": "playstation"}, {"question_id": 35676, "answer": "home event"}, {"question_id": 35679, "answer": "10"}, {"question_id": 35682, "answer": "3"}, {"question_id": 35685, "answer": "2"}, {"question_id": 35688, "answer": "18"}, {"question_id": 35691, "answer": "yes"}, {"question_id": 35694, "answer": "beatles"}, {"question_id": 35697, "answer": "university of saskatchewan"}, {"question_id": 35700, "answer": ""}, {"question_id": 35703, "answer": "rick parrish"}, {"question_id": 35706, "answer": "car"}, {"question_id": 35709, "answer": "donut world"}, {"question_id": 35712, "answer": "mr lidio"}, {"question_id": 35715, "answer": "right"}, {"question_id": 35718, "answer": "2010"}, {"question_id": 35721, "answer": "illinois"}, {"question_id": 35724, "answer": "vnes"}, {"question_id": 35727, "answer": "18"}, {"question_id": 35730, "answer": "2"}, {"question_id": 35733, "answer": "fitbit"}, {"question_id": 35736, "answer": "40"}, {"question_id": 35739, "answer": "intel"}, {"question_id": 35742, "answer": "80"}, {"question_id": 35745, "answer": "rbc"}, {"question_id": 35748, "answer": "hamilton watch company"}, {"question_id": 35751, "answer": "chardonnay"}, {"question_id": 35754, "answer": "n"}, {"question_id": 35757, "answer": "pizza"}, {"question_id": 35760, "answer": "4"}, {"question_id": 35763, "answer": "blue jerseys"}, {"question_id": 35766, "answer": "f12"}, {"question_id": 35769, "answer": "milk"}, {"question_id": 35772, "answer": "spartans"}, {"question_id": 35775, "answer": "fairfield"}, {"question_id": 35778, "answer": "trojans"}, {"question_id": 35781, "answer": "65"}, {"question_id": 35784, "answer": "apple store"}, {"question_id": 35787, "answer": "coca cola"}, {"question_id": 35790, "answer": "peligo"}, {"question_id": 35793, "answer": "72"}, {"question_id": 35796, "answer": "hydroxycut"}, {"question_id": 35799, "answer": "ruby"}, {"question_id": 35802, "answer": "tab"}, {"question_id": 35805, "answer": "orange"}, {"question_id": 35808, "answer": "emel\u00e9"}, {"question_id": 35811, "answer": "19"}, {"question_id": 35814, "answer": "ghost"}, {"question_id": 35817, "answer": "samsung"}, {"question_id": 35820, "answer": "audur"}, {"question_id": 35823, "answer": "yes"}, {"question_id": 35826, "answer": "yes"}, {"question_id": 35829, "answer": "vikings"}, {"question_id": 35832, "answer": "peach"}, {"question_id": 35835, "answer": "winners"}, {"question_id": 35838, "answer": "x"}, {"question_id": 35841, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 35844, "answer": "tie"}, {"question_id": 35847, "answer": "10:30"}, {"question_id": 35850, "answer": "framing company"}, {"question_id": 35853, "answer": "12"}, {"question_id": 35856, "answer": "wolverines"}, {"question_id": 35859, "answer": "france"}, {"question_id": 35862, "answer": "american"}, {"question_id": 35865, "answer": "microsoft"}, {"question_id": 35868, "answer": "no"}, {"question_id": 35871, "answer": "jim hamer"}, {"question_id": 35874, "answer": "11"}, {"question_id": 35877, "answer": "tax"}, {"question_id": 35880, "answer": "restaurant"}, {"question_id": 35883, "answer": "x treme"}, {"question_id": 35886, "answer": "butter"}, {"question_id": 35889, "answer": "tale of gettysburg white"}, {"question_id": 35892, "answer": "edwards"}, {"question_id": 35895, "answer": "19"}, {"question_id": 35898, "answer": "crossroft"}, {"question_id": 35901, "answer": "5%"}, {"question_id": 35904, "answer": "50"}, {"question_id": 35907, "answer": "rolex"}, {"question_id": 35910, "answer": "yes"}, {"question_id": 35913, "answer": "navy"}, {"question_id": 35916, "answer": "chocolate chip"}, {"question_id": 35919, "answer": "yes"}, {"question_id": 35922, "answer": "enter"}, {"question_id": 35925, "answer": "100"}, {"question_id": 35928, "answer": "pop"}, {"question_id": 35931, "answer": "44"}, {"question_id": 35934, "answer": "premier"}, {"question_id": 35937, "answer": "dell"}, {"question_id": 35940, "answer": "b"}, {"question_id": 35943, "answer": "17"}, {"question_id": 35946, "answer": "red bull"}, {"question_id": 35949, "answer": "usa"}, {"question_id": 35952, "answer": "phone"}, {"question_id": 35955, "answer": "safety"}, {"question_id": 35958, "answer": "hot n ready"}, {"question_id": 35961, "answer": "4"}, {"question_id": 35964, "answer": "wwwalterelectracom"}, {"question_id": 35967, "answer": "dehovoed"}, {"question_id": 35970, "answer": "1234567890"}, {"question_id": 35973, "answer": "raymond"}, {"question_id": 35976, "answer": "w w w w"}, {"question_id": 35979, "answer": "police"}, {"question_id": 35982, "answer": "yes"}, {"question_id": 35985, "answer": "macbook air"}, {"question_id": 35988, "answer": "top hit"}, {"question_id": 35991, "answer": "warriors"}, {"question_id": 35994, "answer": "lorina"}, {"question_id": 35997, "answer": "2013"}, {"question_id": 36000, "answer": "1700000"}, {"question_id": 36003, "answer": "think ahead when only portable is typewriter"}, {"question_id": 36006, "answer": "11"}, {"question_id": 36009, "answer": "$1.99"}, {"question_id": 36012, "answer": "37"}, {"question_id": 36015, "answer": "tumble fan"}, {"question_id": 36018, "answer": "100"}, {"question_id": 36021, "answer": "polar"}, {"question_id": 36024, "answer": "cubs"}, {"question_id": 36027, "answer": "gireau"}, {"question_id": 36030, "answer": "john rusher 's aftermath"}, {"question_id": 36033, "answer": "wine"}, {"question_id": 36036, "answer": "500"}, {"question_id": 36039, "answer": "i am monster"}, {"question_id": 36042, "answer": "$100"}, {"question_id": 36045, "answer": "hanoi"}, {"question_id": 36048, "answer": "18"}, {"question_id": 36051, "answer": "no"}, {"question_id": 36054, "answer": "cheer"}, {"question_id": 36057, "answer": "vienna"}, {"question_id": 36060, "answer": "25"}, {"question_id": 36063, "answer": "mac"}, {"question_id": 36066, "answer": "number 7"}, {"question_id": 36069, "answer": "citibank"}, {"question_id": 36072, "answer": "sailboat"}, {"question_id": 36075, "answer": "beer"}, {"question_id": 36078, "answer": "1759"}, {"question_id": 36081, "answer": "yes"}, {"question_id": 36084, "answer": "fridays"}, {"question_id": 36087, "answer": "66"}, {"question_id": 36090, "answer": "pacemont"}, {"question_id": 36093, "answer": "sony"}, {"question_id": 36096, "answer": "pennsylvania"}, {"question_id": 36099, "answer": "jruby"}, {"question_id": 36102, "answer": "stop"}, {"question_id": 36105, "answer": "kenwood"}, {"question_id": 36108, "answer": "coca cola"}, {"question_id": 36111, "answer": "france"}, {"question_id": 36114, "answer": "12"}, {"question_id": 36117, "answer": "kenya daily news"}, {"question_id": 36120, "answer": "honda"}, {"question_id": 36123, "answer": "yes"}, {"question_id": 36126, "answer": "yes"}, {"question_id": 36129, "answer": "mobilcom"}, {"question_id": 36132, "answer": "kumamoto"}, {"question_id": 36135, "answer": "16"}, {"question_id": 36138, "answer": "twitter"}, {"question_id": 36141, "answer": ""}, {"question_id": 36144, "answer": "tigers"}, {"question_id": 36147, "answer": "10"}, {"question_id": 36150, "answer": "10:10"}, {"question_id": 36153, "answer": "v"}, {"question_id": 36156, "answer": "bill"}, {"question_id": 36159, "answer": "f"}, {"question_id": 36162, "answer": "product design for web"}, {"question_id": 36165, "answer": "houseboat"}, {"question_id": 36168, "answer": "jr"}, {"question_id": 36171, "answer": "antwerpener"}, {"question_id": 36174, "answer": "band box cleaners"}, {"question_id": 36177, "answer": "bbm"}, {"question_id": 36180, "answer": "ticket"}, {"question_id": 36183, "answer": "citi"}, {"question_id": 36186, "answer": "performance time task"}, {"question_id": 36189, "answer": "800"}, {"question_id": 36192, "answer": "elephant"}, {"question_id": 36195, "answer": "queen victoria"}, {"question_id": 36198, "answer": "o and t"}, {"question_id": 36201, "answer": "heute denken fertig"}, {"question_id": 36204, "answer": "fallout"}, {"question_id": 36207, "answer": "1819"}, {"question_id": 36210, "answer": "telecom"}, {"question_id": 36213, "answer": "united states"}, {"question_id": 36216, "answer": "american apparel"}, {"question_id": 36219, "answer": "$10.00"}, {"question_id": 36222, "answer": "apologia"}, {"question_id": 36225, "answer": "stop"}, {"question_id": 36228, "answer": "yes"}, {"question_id": 36231, "answer": "slimbrew"}, {"question_id": 36234, "answer": "fortana"}, {"question_id": 36237, "answer": "apples"}, {"question_id": 36240, "answer": "k"}, {"question_id": 36243, "answer": "brown"}, {"question_id": 36246, "answer": "lisztzka"}, {"question_id": 36249, "answer": "10:30"}, {"question_id": 36252, "answer": "free"}, {"question_id": 36255, "answer": "1200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 36258, "answer": "250"}, {"question_id": 36261, "answer": "no"}, {"question_id": 36264, "answer": "1400"}, {"question_id": 36267, "answer": "corn"}, {"question_id": 36270, "answer": "welcome"}, {"question_id": 36273, "answer": "100"}, {"question_id": 36276, "answer": "visa"}, {"question_id": 36279, "answer": "millennial man"}, {"question_id": 36282, "answer": "rrftc"}, {"question_id": 36285, "answer": "rock"}, {"question_id": 36288, "answer": "apple"}, {"question_id": 36291, "answer": "nintendo"}, {"question_id": 36294, "answer": "10:00"}, {"question_id": 36297, "answer": "dylan rhodes"}, {"question_id": 36300, "answer": "limits"}, {"question_id": 36303, "answer": "22"}, {"question_id": 36306, "answer": "paint"}, {"question_id": 36309, "answer": "china"}, {"question_id": 36312, "answer": "automotive"}, {"question_id": 36315, "answer": "10"}, {"question_id": 36318, "answer": "air pacific"}, {"question_id": 36321, "answer": "80"}, {"question_id": 36324, "answer": "death 's corner"}, {"question_id": 36327, "answer": "10:10"}, {"question_id": 36330, "answer": "9"}, {"question_id": 36333, "answer": "nissan"}, {"question_id": 36336, "answer": "9412"}, {"question_id": 36339, "answer": "x men"}, {"question_id": 36342, "answer": "tonight"}, {"question_id": 36345, "answer": "10:55 pm"}, {"question_id": 36348, "answer": "villa di marte"}, {"question_id": 36351, "answer": "my gto"}, {"question_id": 36354, "answer": "yes"}, {"question_id": 36357, "answer": "lg"}, {"question_id": 36360, "answer": "chocolate"}, {"question_id": 36363, "answer": "10000"}, {"question_id": 36366, "answer": "california"}, {"question_id": 36369, "answer": "dead or alive 5 nintendo 64"}, {"question_id": 36372, "answer": "stop"}, {"question_id": 36375, "answer": "abraxas"}, {"question_id": 36378, "answer": "17 and 18"}, {"question_id": 36381, "answer": "80"}, {"question_id": 36384, "answer": "city bus"}, {"question_id": 36387, "answer": "swr"}, {"question_id": 36390, "answer": "nokia"}, {"question_id": 36393, "answer": "black"}, {"question_id": 36396, "answer": "i am musician and fucking filthah"}, {"question_id": 36399, "answer": "puma"}, {"question_id": 36402, "answer": "1990s"}, {"question_id": 36405, "answer": "arduino"}, {"question_id": 36408, "answer": "brooklyn"}, {"question_id": 36411, "answer": "dangerous"}, {"question_id": 36414, "answer": "100"}, {"question_id": 36417, "answer": "france"}, {"question_id": 36420, "answer": "vanilla"}, {"question_id": 36423, "answer": "belladonna"}, {"question_id": 36426, "answer": "lcd"}, {"question_id": 36429, "answer": "lego"}, {"question_id": 36432, "answer": "no"}, {"question_id": 36435, "answer": "dolls"}, {"question_id": 36438, "answer": "food"}, {"question_id": 36441, "answer": "500"}, {"question_id": 36444, "answer": "bathroom"}, {"question_id": 36447, "answer": "3"}, {"question_id": 36450, "answer": "no"}, {"question_id": 36453, "answer": "fanatic"}, {"question_id": 36456, "answer": "100"}, {"question_id": 36459, "answer": "virginia"}, {"question_id": 36462, "answer": "1850s"}, {"question_id": 36465, "answer": "chimay"}, {"question_id": 36468, "answer": "hybrid"}, {"question_id": 36471, "answer": "100"}, {"question_id": 36474, "answer": "$100"}, {"question_id": 36477, "answer": "first year"}, {"question_id": 36480, "answer": "pharmacy"}, {"question_id": 36483, "answer": "4"}, {"question_id": 36486, "answer": "tissot"}, {"question_id": 36489, "answer": "10:00"}, {"question_id": 36492, "answer": "14"}, {"question_id": 36495, "answer": "het judaesooog dutch"}, {"question_id": 36498, "answer": "history"}, {"question_id": 36501, "answer": "yankees"}, {"question_id": 36504, "answer": "saturday"}, {"question_id": 36507, "answer": "17"}, {"question_id": 36510, "answer": "ta"}, {"question_id": 36513, "answer": "gracano"}, {"question_id": 36516, "answer": "50"}, {"question_id": 36519, "answer": "emergency"}, {"question_id": 36522, "answer": "canada"}, {"question_id": 36525, "answer": "st vincent de paul"}, {"question_id": 36528, "answer": "22"}, {"question_id": 36531, "answer": "2015"}, {"question_id": 36534, "answer": "banca transsilvania"}, {"question_id": 36537, "answer": "koran"}, {"question_id": 36540, "answer": "yes"}, {"question_id": 36543, "answer": "starbucks"}, {"question_id": 36546, "answer": "david konheim"}, {"question_id": 36549, "answer": "100"}, {"question_id": 36552, "answer": "beer"}, {"question_id": 36555, "answer": "12 x 12 x 12"}, {"question_id": 36558, "answer": "nikos kazantzakis"}, {"question_id": 36561, "answer": "45"}, {"question_id": 36564, "answer": "toshiba"}, {"question_id": 36567, "answer": "10:10"}, {"question_id": 36570, "answer": "concordia"}, {"question_id": 36573, "answer": "palace"}, {"question_id": 36576, "answer": "natron"}, {"question_id": 36579, "answer": "yes"}, {"question_id": 36582, "answer": "mariners"}, {"question_id": 36585, "answer": "plaque"}, {"question_id": 36588, "answer": "taste quality"}, {"question_id": 36591, "answer": "raymond"}, {"question_id": 36594, "answer": "bishop 's finger"}, {"question_id": 36597, "answer": "1999"}, {"question_id": 36600, "answer": "taco"}, {"question_id": 36603, "answer": "boston"}, {"question_id": 36606, "answer": "johnson"}, {"question_id": 36609, "answer": "bookworm lasercom"}, {"question_id": 36612, "answer": "black"}, {"question_id": 36615, "answer": "11"}, {"question_id": 36618, "answer": "m"}, {"question_id": 36621, "answer": "hot sauce"}, {"question_id": 36624, "answer": "foundation 's edge"}, {"question_id": 36627, "answer": "no"}, {"question_id": 36630, "answer": "yes"}, {"question_id": 36633, "answer": "asahi breweries"}, {"question_id": 36636, "answer": "77"}, {"question_id": 36639, "answer": "penny"}, {"question_id": 36642, "answer": "united states"}, {"question_id": 36645, "answer": "zimax"}, {"question_id": 36648, "answer": "zara"}, {"question_id": 36651, "answer": "6"}, {"question_id": 36654, "answer": "samsung"}, {"question_id": 36657, "answer": "joyce carol oates"}, {"question_id": 36660, "answer": "yes"}, {"question_id": 36663, "answer": "billboard"}, {"question_id": 36666, "answer": "lenovo"}, {"question_id": 36669, "answer": "stephen covey"}, {"question_id": 36672, "answer": "10"}, {"question_id": 36675, "answer": "10000"}, {"question_id": 36678, "answer": "40"}, {"question_id": 36681, "answer": "king"}, {"question_id": 36684, "answer": "nemesispizzacom"}, {"question_id": 36687, "answer": "rolex"}, {"question_id": 36690, "answer": "tour bus"}, {"question_id": 36693, "answer": "ryuji"}, {"question_id": 36696, "answer": "c"}, {"question_id": 36699, "answer": "explorers"}, {"question_id": 36702, "answer": "dell"}, {"question_id": 36705, "answer": "lenovo"}, {"question_id": 36708, "answer": "l'oreal"}, {"question_id": 36711, "answer": "t mobile"}, {"question_id": 36714, "answer": "wii"}, {"question_id": 36717, "answer": "12:11"}, {"question_id": 36720, "answer": "lmcom"}, {"question_id": 36723, "answer": "no"}, {"question_id": 36726, "answer": "10"}, {"question_id": 36729, "answer": "ballonworksuk"}, {"question_id": 36732, "answer": "milan"}, {"question_id": 36735, "answer": ""}, {"question_id": 36738, "answer": "baseball"}, {"question_id": 36741, "answer": "dell"}, {"question_id": 36744, "answer": "yes"}, {"question_id": 36747, "answer": "gatorade"}, {"question_id": 36750, "answer": "police"}, {"question_id": 36753, "answer": "sherry"}, {"question_id": 36756, "answer": "heineken"}, {"question_id": 36759, "answer": "3"}, {"question_id": 36762, "answer": "airbus"}, {"question_id": 36765, "answer": "write to project team at city center master plan project"}, {"question_id": 36768, "answer": "new york"}, {"question_id": 36771, "answer": "x"}, {"question_id": 36774, "answer": "pittsburgh"}, {"question_id": 36777, "answer": "men"}, {"question_id": 36780, "answer": "toronto"}, {"question_id": 36783, "answer": "1895"}, {"question_id": 36786, "answer": "12"}, {"question_id": 36789, "answer": "it is not door"}, {"question_id": 36792, "answer": "doctor"}, {"question_id": 36795, "answer": "black book"}, {"question_id": 36798, "answer": "facebook"}, {"question_id": 36801, "answer": "nanking massacre"}, {"question_id": 36804, "answer": "81"}, {"question_id": 36807, "answer": "nokia"}, {"question_id": 36810, "answer": "user"}, {"question_id": 36813, "answer": "nikon"}, {"question_id": 36816, "answer": "technion"}, {"question_id": 36819, "answer": "pacific"}, {"question_id": 36822, "answer": "welcome"}, {"question_id": 36825, "answer": "flickr"}, {"question_id": 36828, "answer": "october 26"}, {"question_id": 36831, "answer": "airasia"}, {"question_id": 36834, "answer": "sapporo"}, {"question_id": 36837, "answer": "london"}, {"question_id": 36840, "answer": "hovercraft"}, {"question_id": 36843, "answer": "peace"}, {"question_id": 36846, "answer": "february 2012"}, {"question_id": 36849, "answer": "mexicans"}, {"question_id": 36852, "answer": "we are entering period of concern"}, {"question_id": 36855, "answer": "barbera"}, {"question_id": 36858, "answer": "train with grain"}, {"question_id": 36861, "answer": "bucket"}, {"question_id": 36864, "answer": "100"}, {"question_id": 36867, "answer": "no"}, {"question_id": 36870, "answer": "walmart"}, {"question_id": 36873, "answer": "100"}, {"question_id": 36876, "answer": "lg"}, {"question_id": 36879, "answer": "2"}, {"question_id": 36882, "answer": "10:00"}, {"question_id": 36885, "answer": "0123456789"}, {"question_id": 36888, "answer": "mario"}, {"question_id": 36891, "answer": "people"}, {"question_id": 36894, "answer": "box"}, {"question_id": 36897, "answer": "10:00"}, {"question_id": 36900, "answer": "time machine"}, {"question_id": 36903, "answer": "2015"}, {"question_id": 36906, "answer": "pinarello"}, {"question_id": 36909, "answer": "2012"}, {"question_id": 36912, "answer": "cross street"}, {"question_id": 36915, "answer": "astrophysics"}, {"question_id": 36918, "answer": "tamarindo"}, {"question_id": 36921, "answer": "f"}, {"question_id": 36924, "answer": "auto and truck"}, {"question_id": 36927, "answer": "11"}, {"question_id": 36930, "answer": "andrew"}, {"question_id": 36933, "answer": "green peace"}, {"question_id": 36936, "answer": "hmv"}, {"question_id": 36939, "answer": "enter"}, {"question_id": 36942, "answer": "100"}, {"question_id": 36945, "answer": "town hall"}, {"question_id": 36948, "answer": "olivetti"}, {"question_id": 36951, "answer": "idiot"}, {"question_id": 36954, "answer": "yes"}, {"question_id": 36957, "answer": "peas"}, {"question_id": 36960, "answer": "nike"}, {"question_id": 36963, "answer": "omumu"}, {"question_id": 36966, "answer": "45"}, {"question_id": 36969, "answer": "red"}, {"question_id": 36972, "answer": "liquid"}, {"question_id": 36975, "answer": "tasmanian"}, {"question_id": 36978, "answer": "gatorade"}, {"question_id": 36981, "answer": "complete series"}, {"question_id": 36984, "answer": "london"}, {"question_id": 36987, "answer": "book"}, {"question_id": 36990, "answer": "10"}, {"question_id": 36993, "answer": "2005"}, {"question_id": 36996, "answer": "virginia"}, {"question_id": 36999, "answer": "tremont"}, {"question_id": 37002, "answer": "westland"}, {"question_id": 37005, "answer": "new graphics toolkit"}, {"question_id": 37008, "answer": "coca"}, {"question_id": 37011, "answer": "tamale town"}, {"question_id": 37014, "answer": "coca"}, {"question_id": 37017, "answer": "keo"}, {"question_id": 37020, "answer": "stop"}, {"question_id": 37023, "answer": "no"}, {"question_id": 37026, "answer": "sy"}, {"question_id": 37029, "answer": "700"}, {"question_id": 37032, "answer": "unnera"}, {"question_id": 37035, "answer": "tv series"}, {"question_id": 37038, "answer": "orange"}, {"question_id": 37041, "answer": "bama"}, {"question_id": 37044, "answer": "10:10"}, {"question_id": 37047, "answer": "whiskey"}, {"question_id": 37050, "answer": "wood"}, {"question_id": 37053, "answer": "1975"}, {"question_id": 37056, "answer": "10:10"}, {"question_id": 37059, "answer": "october 2014"}, {"question_id": 37062, "answer": "stretched jackson bridges"}, {"question_id": 37065, "answer": "library"}, {"question_id": 37068, "answer": "no"}, {"question_id": 37071, "answer": "3"}, {"question_id": 37074, "answer": "1200"}, {"question_id": 37077, "answer": "sand dales"}, {"question_id": 37080, "answer": "belle"}, {"question_id": 37083, "answer": "met"}, {"question_id": 37086, "answer": "r"}, {"question_id": 37089, "answer": "101"}, {"question_id": 37092, "answer": "12"}, {"question_id": 37095, "answer": "french football"}, {"question_id": 37098, "answer": "highway 100"}, {"question_id": 37101, "answer": "1888"}, {"question_id": 37104, "answer": "desert"}, {"question_id": 37107, "answer": ""}, {"question_id": 37110, "answer": "greenscreen"}, {"question_id": 37113, "answer": "mitsouno"}, {"question_id": 37116, "answer": ""}, {"question_id": 37119, "answer": "tornado"}, {"question_id": 37122, "answer": "i am your father"}, {"question_id": 37125, "answer": "madison"}, {"question_id": 37128, "answer": "11"}, {"question_id": 37131, "answer": "1999"}, {"question_id": 37134, "answer": "wine"}, {"question_id": 37137, "answer": "2"}, {"question_id": 37140, "answer": "yes"}, {"question_id": 37143, "answer": "airphone"}, {"question_id": 37146, "answer": "restaurant restoff"}, {"question_id": 37149, "answer": "toshiba"}, {"question_id": 37152, "answer": "29"}, {"question_id": 37155, "answer": "9"}, {"question_id": 37158, "answer": "11"}, {"question_id": 37161, "answer": "hong kong"}, {"question_id": 37164, "answer": "1930"}, {"question_id": 37167, "answer": "papa told me"}, {"question_id": 37170, "answer": "orozo"}, {"question_id": 37173, "answer": "june"}, {"question_id": 37176, "answer": "0"}, {"question_id": 37179, "answer": "ikea"}, {"question_id": 37182, "answer": "2014"}, {"question_id": 37185, "answer": "google"}, {"question_id": 37188, "answer": "test"}, {"question_id": 37191, "answer": "bike"}, {"question_id": 37194, "answer": "out"}, {"question_id": 37197, "answer": "stop sign"}, {"question_id": 37200, "answer": "2010"}, {"question_id": 37203, "answer": "royal"}, {"question_id": 37206, "answer": "7"}, {"question_id": 37209, "answer": "no"}, {"question_id": 37212, "answer": "jared diamond"}, {"question_id": 37215, "answer": "am333"}, {"question_id": 37218, "answer": "woods cross"}, {"question_id": 37221, "answer": "yes"}, {"question_id": 37224, "answer": "emomobile"}, {"question_id": 37227, "answer": "toshiba"}, {"question_id": 37230, "answer": "10:08:11"}, {"question_id": 37233, "answer": "logos"}, {"question_id": 37236, "answer": "turn right"}, {"question_id": 37239, "answer": "10"}, {"question_id": 37242, "answer": "angel"}, {"question_id": 37245, "answer": "meat"}, {"question_id": 37248, "answer": "william sarbande"}, {"question_id": 37251, "answer": "trek"}, {"question_id": 37254, "answer": "aerosmith"}, {"question_id": 37257, "answer": "rbc"}, {"question_id": 37260, "answer": "canon cat"}, {"question_id": 37263, "answer": "canon"}, {"question_id": 37266, "answer": "tiles"}, {"question_id": 37269, "answer": "starbucks"}, {"question_id": 37272, "answer": "61"}, {"question_id": 37275, "answer": "dungeons of doom"}, {"question_id": 37278, "answer": "china"}, {"question_id": 37281, "answer": "green"}, {"question_id": 37284, "answer": "yes"}, {"question_id": 37287, "answer": "ronaldo"}, {"question_id": 37290, "answer": "stop"}, {"question_id": 37293, "answer": "yes"}, {"question_id": 37296, "answer": "bushmills"}, {"question_id": 37299, "answer": "no"}, {"question_id": 37302, "answer": "clubs"}, {"question_id": 37305, "answer": "walgreens"}, {"question_id": 37308, "answer": "french franc"}, {"question_id": 37311, "answer": "yes"}, {"question_id": 37314, "answer": "1996"}, {"question_id": 37317, "answer": "baseball"}, {"question_id": 37320, "answer": "grand lait 45"}, {"question_id": 37323, "answer": "red and black"}, {"question_id": 37326, "answer": "50"}, {"question_id": 37329, "answer": "airport extreme 802.11n wifi"}, {"question_id": 37332, "answer": "yes"}, {"question_id": 37335, "answer": "milk"}, {"question_id": 37338, "answer": "sunex"}, {"question_id": 37341, "answer": "rolex"}, {"question_id": 37344, "answer": "$10"}, {"question_id": 37347, "answer": "dubai"}, {"question_id": 37350, "answer": "north america"}, {"question_id": 37353, "answer": "budweiser"}, {"question_id": 37356, "answer": "yes"}, {"question_id": 37359, "answer": "cheese"}, {"question_id": 37362, "answer": "candy"}, {"question_id": 37365, "answer": "10 fl oz"}, {"question_id": 37368, "answer": "snapple"}, {"question_id": 37371, "answer": "kenwood"}, {"question_id": 37374, "answer": "jim"}, {"question_id": 37377, "answer": "blood pressure"}, {"question_id": 37380, "answer": ""}, {"question_id": 37383, "answer": "yes"}, {"question_id": 37386, "answer": "rolex"}, {"question_id": 37389, "answer": "bud"}, {"question_id": 37392, "answer": "tangeros"}, {"question_id": 37395, "answer": "p"}, {"question_id": 37398, "answer": "transtar"}, {"question_id": 37401, "answer": "united states"}, {"question_id": 37404, "answer": "toshiba"}, {"question_id": 37407, "answer": "ellington"}, {"question_id": 37410, "answer": "budweiser"}, {"question_id": 37413, "answer": "100"}, {"question_id": 37416, "answer": "pete 's"}, {"question_id": 37419, "answer": "postcardscom"}, {"question_id": 37422, "answer": "b"}, {"question_id": 37425, "answer": "10"}, {"question_id": 37428, "answer": "2007"}, {"question_id": 37431, "answer": "wewin"}, {"question_id": 37434, "answer": "12 10 2004"}, {"question_id": 37437, "answer": "gordon"}, {"question_id": 37440, "answer": "gn"}, {"question_id": 37443, "answer": "no"}, {"question_id": 37446, "answer": "soda"}, {"question_id": 37449, "answer": "cell phone"}, {"question_id": 37452, "answer": "book"}, {"question_id": 37455, "answer": "in vogue soul flower"}, {"question_id": 37458, "answer": "milk"}, {"question_id": 37461, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37464, "answer": "apprentice"}, {"question_id": 37467, "answer": "10:45"}, {"question_id": 37470, "answer": "1000"}, {"question_id": 37473, "answer": "oklahoma"}, {"question_id": 37476, "answer": "california"}, {"question_id": 37479, "answer": "fine"}, {"question_id": 37482, "answer": "energy"}, {"question_id": 37485, "answer": "josh"}, {"question_id": 37488, "answer": "123456"}, {"question_id": 37491, "answer": "e r smith"}, {"question_id": 37494, "answer": "yes"}, {"question_id": 37497, "answer": "fish"}, {"question_id": 37500, "answer": "cjj"}, {"question_id": 37503, "answer": "space bar"}, {"question_id": 37506, "answer": "40"}, {"question_id": 37509, "answer": "10 minutes"}, {"question_id": 37512, "answer": "4"}, {"question_id": 37515, "answer": "35"}, {"question_id": 37518, "answer": "candy"}, {"question_id": 37521, "answer": "1900"}, {"question_id": 37524, "answer": "iphone"}, {"question_id": 37527, "answer": "1940s 1950s"}, {"question_id": 37530, "answer": "connected"}, {"question_id": 37533, "answer": "brunello"}, {"question_id": 37536, "answer": "cuvee des fleurs"}, {"question_id": 37539, "answer": "sell"}, {"question_id": 37542, "answer": "oats"}, {"question_id": 37545, "answer": "kitchen aid"}, {"question_id": 37548, "answer": "100"}, {"question_id": 37551, "answer": "orange"}, {"question_id": 37554, "answer": "andrew"}, {"question_id": 37557, "answer": "no"}, {"question_id": 37560, "answer": "north america"}, {"question_id": 37563, "answer": "enter"}, {"question_id": 37566, "answer": "12"}, {"question_id": 37569, "answer": "christmas"}, {"question_id": 37572, "answer": "phone"}, {"question_id": 37575, "answer": "2nd street"}, {"question_id": 37578, "answer": "rockstar"}, {"question_id": 37581, "answer": "red stripe"}, {"question_id": 37584, "answer": "waste land"}, {"question_id": 37587, "answer": "doughnuts in pink boxes"}, {"question_id": 37590, "answer": "1"}, {"question_id": 37593, "answer": "sports"}, {"question_id": 37596, "answer": "army"}, {"question_id": 37599, "answer": "virtual"}, {"question_id": 37602, "answer": "les soci\u00e9t\u00e9s"}, {"question_id": 37605, "answer": "adidas"}, {"question_id": 37608, "answer": "chirru"}, {"question_id": 37611, "answer": "crossworld"}, {"question_id": 37614, "answer": "pizza palace"}, {"question_id": 37617, "answer": "30086"}, {"question_id": 37620, "answer": "ay royce"}, {"question_id": 37623, "answer": "1955"}, {"question_id": 37626, "answer": "royal"}, {"question_id": 37629, "answer": "beer"}, {"question_id": 37632, "answer": "9"}, {"question_id": 37635, "answer": "coca"}, {"question_id": 37638, "answer": "5"}, {"question_id": 37641, "answer": "taking"}, {"question_id": 37644, "answer": "hipchat"}, {"question_id": 37647, "answer": "smile"}, {"question_id": 37650, "answer": "air"}, {"question_id": 37653, "answer": "hermes"}, {"question_id": 37656, "answer": "no"}, {"question_id": 37659, "answer": "crossing"}, {"question_id": 37662, "answer": "ishsas"}, {"question_id": 37665, "answer": "dark horse"}, {"question_id": 37668, "answer": "chas ward"}, {"question_id": 37671, "answer": "people"}, {"question_id": 37674, "answer": "12:00"}, {"question_id": 37677, "answer": "box"}, {"question_id": 37680, "answer": "stige"}, {"question_id": 37683, "answer": "30"}, {"question_id": 37686, "answer": "stock funding"}, {"question_id": 37689, "answer": "be compassionate"}, {"question_id": 37692, "answer": "island"}, {"question_id": 37695, "answer": "brita"}, {"question_id": 37698, "answer": "adidas"}, {"question_id": 37701, "answer": "sony"}, {"question_id": 37704, "answer": "csu"}, {"question_id": 37707, "answer": "yes"}, {"question_id": 37710, "answer": "baseball"}, {"question_id": 37713, "answer": "yes"}, {"question_id": 37716, "answer": "book of year"}, {"question_id": 37719, "answer": "1999"}, {"question_id": 37722, "answer": "imbue"}, {"question_id": 37725, "answer": "letter"}, {"question_id": 37728, "answer": "1990"}, {"question_id": 37731, "answer": "rubber"}, {"question_id": 37734, "answer": "jim"}, {"question_id": 37737, "answer": "turtles"}, {"question_id": 37740, "answer": "16"}, {"question_id": 37743, "answer": "beer"}, {"question_id": 37746, "answer": "coca cola"}, {"question_id": 37749, "answer": "google"}, {"question_id": 37752, "answer": "c"}, {"question_id": 37755, "answer": "ford"}, {"question_id": 37758, "answer": "12"}, {"question_id": 37761, "answer": "ad"}, {"question_id": 37764, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37767, "answer": "stop sign"}, {"question_id": 37770, "answer": "e e taylor"}, {"question_id": 37773, "answer": "vietnam"}, {"question_id": 37776, "answer": "volkswagen"}, {"question_id": 37779, "answer": "lady chatterley 's lover"}, {"question_id": 37782, "answer": "waste management"}, {"question_id": 37785, "answer": "verreche"}, {"question_id": 37788, "answer": ""}, {"question_id": 37791, "answer": "potts"}, {"question_id": 37794, "answer": "1144"}, {"question_id": 37797, "answer": "10:10"}, {"question_id": 37800, "answer": "platinum egoiste"}, {"question_id": 37803, "answer": "10:10"}, {"question_id": 37806, "answer": "pro events"}, {"question_id": 37809, "answer": "wrist"}, {"question_id": 37812, "answer": "britta"}, {"question_id": 37815, "answer": "23"}, {"question_id": 37818, "answer": "e"}, {"question_id": 37821, "answer": "london"}, {"question_id": 37824, "answer": "10"}, {"question_id": 37827, "answer": "147"}, {"question_id": 37830, "answer": "lakers"}, {"question_id": 37833, "answer": "1200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37836, "answer": "deadpool"}, {"question_id": 37839, "answer": "text"}, {"question_id": 37842, "answer": "washington"}, {"question_id": 37845, "answer": "pizza"}, {"question_id": 37848, "answer": "10"}, {"question_id": 37851, "answer": "7 nights of evil"}, {"question_id": 37854, "answer": "eye"}, {"question_id": 37857, "answer": "ferbet call"}, {"question_id": 37860, "answer": "caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left is caption on bottom left"}, {"question_id": 37863, "answer": "503 222 2222"}, {"question_id": 37866, "answer": ""}, {"question_id": 37869, "answer": "ronzoni"}, {"question_id": 37872, "answer": "10:10"}, {"question_id": 37875, "answer": "savignon blanc"}, {"question_id": 37878, "answer": "tudor"}, {"question_id": 37881, "answer": "10:00"}, {"question_id": 37884, "answer": "x telera"}, {"question_id": 37887, "answer": "right"}, {"question_id": 37890, "answer": "10"}, {"question_id": 37893, "answer": "man"}, {"question_id": 37896, "answer": "rolex"}, {"question_id": 37899, "answer": "charles babbage"}, {"question_id": 37902, "answer": "m6 tal"}, {"question_id": 37905, "answer": "zarulelas famosas spanish"}, {"question_id": 37908, "answer": "heinz"}, {"question_id": 37911, "answer": "eliot wisse"}, {"question_id": 37914, "answer": "ogier"}, {"question_id": 37917, "answer": "talisker"}, {"question_id": 37920, "answer": "holy bible"}, {"question_id": 37923, "answer": "optimus"}, {"question_id": 37926, "answer": "18"}, {"question_id": 37929, "answer": "business design plan"}, {"question_id": 37932, "answer": "soda"}, {"question_id": 37935, "answer": "2012"}, {"question_id": 37938, "answer": "white"}, {"question_id": 37941, "answer": "washington"}, {"question_id": 37944, "answer": "pornumediacom"}, {"question_id": 37947, "answer": "r2500"}, {"question_id": 37950, "answer": "utah"}, {"question_id": 37953, "answer": "tedxconcordia"}, {"question_id": 37956, "answer": "5000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37959, "answer": "cigarette butts"}, {"question_id": 37962, "answer": "red cup"}, {"question_id": 37965, "answer": "7"}, {"question_id": 37968, "answer": "no"}, {"question_id": 37971, "answer": "sr"}, {"question_id": 37974, "answer": "10"}, {"question_id": 37977, "answer": "rem martin"}, {"question_id": 37980, "answer": "army"}, {"question_id": 37983, "answer": "dna"}, {"question_id": 37986, "answer": "motorcycle"}, {"question_id": 37989, "answer": "104"}, {"question_id": 37992, "answer": "40"}, {"question_id": 37995, "answer": "keyboard"}, {"question_id": 37998, "answer": "408"}, {"question_id": 38001, "answer": "cell phone"}, {"question_id": 38004, "answer": "west"}, {"question_id": 38007, "answer": "postcards"}, {"question_id": 38010, "answer": "cathay"}, {"question_id": 38013, "answer": "crocodile"}, {"question_id": 38016, "answer": "dell"}, {"question_id": 38019, "answer": "no"}, {"question_id": 38022, "answer": "plate"}, {"question_id": 38025, "answer": "laughter"}, {"question_id": 38028, "answer": "text editor"}, {"question_id": 38031, "answer": "wild west witches"}, {"question_id": 38034, "answer": "nike"}, {"question_id": 38037, "answer": "1000"}, {"question_id": 38040, "answer": "e"}, {"question_id": 38043, "answer": "tzr"}, {"question_id": 38046, "answer": "15:20"}, {"question_id": 38049, "answer": "100"}, {"question_id": 38052, "answer": "1980"}, {"question_id": 38055, "answer": "11"}, {"question_id": 38058, "answer": "p"}, {"question_id": 38061, "answer": "10"}, {"question_id": 38064, "answer": "100"}, {"question_id": 38067, "answer": "no"}, {"question_id": 38070, "answer": "iphone"}, {"question_id": 38073, "answer": "canada"}, {"question_id": 38076, "answer": "blob"}, {"question_id": 38079, "answer": "school bus"}, {"question_id": 38082, "answer": "amazon"}, {"question_id": 38085, "answer": "gn"}, {"question_id": 38088, "answer": "truthaboutseattlecom"}, {"question_id": 38091, "answer": "10 15 2015"}, {"question_id": 38094, "answer": "3"}, {"question_id": 38097, "answer": "summer blend"}, {"question_id": 38100, "answer": "c"}, {"question_id": 38103, "answer": "lanc\u00f4me"}, {"question_id": 38106, "answer": "1"}, {"question_id": 38109, "answer": "10:10"}, {"question_id": 38112, "answer": "b"}, {"question_id": 38115, "answer": "laurelwood"}, {"question_id": 38118, "answer": "casio"}, {"question_id": 38121, "answer": "1 teaspoon"}, {"question_id": 38124, "answer": "2013"}, {"question_id": 38127, "answer": "ricard"}, {"question_id": 38130, "answer": "duncan plunkett"}, {"question_id": 38133, "answer": "people"}, {"question_id": 38136, "answer": "no"}, {"question_id": 38139, "answer": "hudson"}, {"question_id": 38142, "answer": "dream angel"}, {"question_id": 38145, "answer": "t mobile"}, {"question_id": 38148, "answer": "oakland"}, {"question_id": 38151, "answer": "yes"}, {"question_id": 38154, "answer": "no"}, {"question_id": 38157, "answer": "ipa"}, {"question_id": 38160, "answer": "english ivy 's"}, {"question_id": 38163, "answer": "100"}, {"question_id": 38166, "answer": "apple"}, {"question_id": 38169, "answer": "tempa"}, {"question_id": 38172, "answer": "15 october 2010"}, {"question_id": 38175, "answer": "espresso"}, {"question_id": 38178, "answer": "crayola"}, {"question_id": 38181, "answer": "4"}, {"question_id": 38184, "answer": "22"}, {"question_id": 38187, "answer": "team 1000"}, {"question_id": 38190, "answer": "11:44"}, {"question_id": 38193, "answer": "text"}, {"question_id": 38196, "answer": "orange"}, {"question_id": 38199, "answer": "toffer"}, {"question_id": 38202, "answer": "sony"}, {"question_id": 38205, "answer": "100"}, {"question_id": 38208, "answer": "p 1234 h"}, {"question_id": 38211, "answer": "stop"}, {"question_id": 38214, "answer": "taxi"}, {"question_id": 38217, "answer": "terra rossa"}, {"question_id": 38220, "answer": "123456"}, {"question_id": 38223, "answer": "astros"}, {"question_id": 38226, "answer": "china"}, {"question_id": 38229, "answer": "el capitan"}, {"question_id": 38232, "answer": "$9.99"}, {"question_id": 38235, "answer": "1940s"}, {"question_id": 38238, "answer": "tape"}, {"question_id": 38241, "answer": "city"}, {"question_id": 38244, "answer": "skyy"}, {"question_id": 38247, "answer": "soda"}, {"question_id": 38250, "answer": "surgery general office"}, {"question_id": 38253, "answer": "kids"}, {"question_id": 38256, "answer": "computer"}, {"question_id": 38259, "answer": "vietnam"}, {"question_id": 38262, "answer": "black beans"}, {"question_id": 38265, "answer": "87"}, {"question_id": 38268, "answer": "city bus"}, {"question_id": 38271, "answer": "trash"}, {"question_id": 38274, "answer": "samsung"}, {"question_id": 38277, "answer": "wikki"}, {"question_id": 38280, "answer": "2013"}, {"question_id": 38283, "answer": "royal legacy"}, {"question_id": 38286, "answer": "halloween"}, {"question_id": 38289, "answer": "ireland"}, {"question_id": 38292, "answer": "tv"}, {"question_id": 38295, "answer": "star wars"}, {"question_id": 38298, "answer": "yes"}, {"question_id": 38301, "answer": "oscar"}, {"question_id": 38304, "answer": "greenbaugh"}, {"question_id": 38307, "answer": "red"}, {"question_id": 38310, "answer": "lab"}, {"question_id": 38313, "answer": "taylor"}, {"question_id": 38316, "answer": "2009"}, {"question_id": 38319, "answer": "right"}, {"question_id": 38322, "answer": "giants"}, {"question_id": 38325, "answer": "no parking"}, {"question_id": 38328, "answer": "cloud"}, {"question_id": 38331, "answer": "273"}, {"question_id": 38334, "answer": "arsenal"}, {"question_id": 38337, "answer": "toys"}, {"question_id": 38340, "answer": "premium"}, {"question_id": 38343, "answer": "10:10"}, {"question_id": 38346, "answer": "100"}, {"question_id": 38349, "answer": "tequila"}, {"question_id": 38352, "answer": "rum"}, {"question_id": 38355, "answer": "chateau ponte cane"}, {"question_id": 38358, "answer": "cellson"}, {"question_id": 38361, "answer": "burlington"}, {"question_id": 38364, "answer": "strength"}, {"question_id": 38367, "answer": "white"}, {"question_id": 38370, "answer": "electra"}, {"question_id": 38373, "answer": "samsung"}, {"question_id": 38376, "answer": "black"}, {"question_id": 38379, "answer": "0.25 per minute"}, {"question_id": 38382, "answer": "chicago"}, {"question_id": 38385, "answer": "vorker"}, {"question_id": 38388, "answer": "canucks"}, {"question_id": 38391, "answer": "22"}, {"question_id": 38394, "answer": "shiner"}, {"question_id": 38397, "answer": "8 j593"}, {"question_id": 38400, "answer": "raspberry pussy"}, {"question_id": 38403, "answer": "yes"}, {"question_id": 38406, "answer": "24"}, {"question_id": 38409, "answer": "asahi"}, {"question_id": 38412, "answer": "100"}, {"question_id": 38415, "answer": "vulcan"}, {"question_id": 38418, "answer": "brown"}, {"question_id": 38421, "answer": "ryanair"}, {"question_id": 38424, "answer": "iron maiden"}, {"question_id": 38427, "answer": "virgin money"}, {"question_id": 38430, "answer": "chateau"}, {"question_id": 38433, "answer": "xii"}, {"question_id": 38436, "answer": "2010"}, {"question_id": 38439, "answer": "1950"}, {"question_id": 38442, "answer": "boston"}, {"question_id": 38445, "answer": "21"}, {"question_id": 38448, "answer": "5"}, {"question_id": 38451, "answer": "general motors"}, {"question_id": 38454, "answer": "elvis presley"}, {"question_id": 38457, "answer": "habanero"}, {"question_id": 38460, "answer": "leuven"}, {"question_id": 38463, "answer": ""}, {"question_id": 38466, "answer": "italy"}, {"question_id": 38469, "answer": "cranbrook"}, {"question_id": 38472, "answer": "american"}, {"question_id": 38475, "answer": "sony"}, {"question_id": 38478, "answer": "u"}, {"question_id": 38481, "answer": "10"}, {"question_id": 38484, "answer": "atomik"}, {"question_id": 38487, "answer": "hp lovecraft"}, {"question_id": 38490, "answer": "honda"}, {"question_id": 38493, "answer": "100"}, {"question_id": 38496, "answer": "bulletin"}, {"question_id": 38499, "answer": "samsung"}, {"question_id": 38502, "answer": "route 66"}, {"question_id": 38505, "answer": "jamee albran"}, {"question_id": 38508, "answer": ""}, {"question_id": 38511, "answer": "sony"}, {"question_id": 38514, "answer": "birthday dog"}, {"question_id": 38517, "answer": "ncaa"}, {"question_id": 38520, "answer": "red"}, {"question_id": 38523, "answer": "yes"}, {"question_id": 38526, "answer": "59"}, {"question_id": 38529, "answer": "ibm"}, {"question_id": 38532, "answer": "john f kennedy"}, {"question_id": 38535, "answer": "president"}, {"question_id": 38538, "answer": "11 15 2015"}, {"question_id": 38541, "answer": "small"}, {"question_id": 38544, "answer": "30"}, {"question_id": 38547, "answer": "decivilisation"}, {"question_id": 38550, "answer": "10"}, {"question_id": 38553, "answer": "houston"}, {"question_id": 38556, "answer": "bah humbug"}, {"question_id": 38559, "answer": "google"}, {"question_id": 38562, "answer": "no"}, {"question_id": 38565, "answer": "callaway"}, {"question_id": 38568, "answer": "you've been sold"}, {"question_id": 38571, "answer": "qatar"}, {"question_id": 38574, "answer": "sony"}, {"question_id": 38577, "answer": "5:05"}, {"question_id": 38580, "answer": "8"}, {"question_id": 38583, "answer": "10"}, {"question_id": 38586, "answer": "404"}, {"question_id": 38589, "answer": "23"}, {"question_id": 38592, "answer": "beer"}, {"question_id": 38595, "answer": "boss noveau"}, {"question_id": 38598, "answer": "am it build it live"}, {"question_id": 38601, "answer": "car"}, {"question_id": 38604, "answer": "hp"}, {"question_id": 38607, "answer": "clock tower"}, {"question_id": 38610, "answer": "becquerel"}, {"question_id": 38613, "answer": "10"}, {"question_id": 38616, "answer": "35"}, {"question_id": 38619, "answer": "lmcom"}, {"question_id": 38622, "answer": "1880"}, {"question_id": 38625, "answer": "no"}, {"question_id": 38628, "answer": "france"}, {"question_id": 38631, "answer": "nhl"}, {"question_id": 38634, "answer": "textbooks"}, {"question_id": 38637, "answer": "sizzler"}, {"question_id": 38640, "answer": "3d"}, {"question_id": 38643, "answer": "daily camera"}, {"question_id": 38646, "answer": "44"}, {"question_id": 38649, "answer": "long island"}, {"question_id": 38652, "answer": "ootberfest"}, {"question_id": 38655, "answer": "texas"}, {"question_id": 38658, "answer": "cd"}, {"question_id": 38661, "answer": "florida vs united states"}, {"question_id": 38664, "answer": "big rock candy mountain"}, {"question_id": 38667, "answer": "bulletin"}, {"question_id": 38670, "answer": "space bar"}, {"question_id": 38673, "answer": "starbucks"}, {"question_id": 38676, "answer": "oracle"}, {"question_id": 38679, "answer": "10 20 2010"}, {"question_id": 38682, "answer": "8"}, {"question_id": 38685, "answer": "yale"}, {"question_id": 38688, "answer": "3"}, {"question_id": 38691, "answer": "soda"}, {"question_id": 38694, "answer": "tripel"}, {"question_id": 38697, "answer": "boxes"}, {"question_id": 38700, "answer": "sapporo classic"}, {"question_id": 38703, "answer": "royal air force"}, {"question_id": 38706, "answer": "sea"}, {"question_id": 38709, "answer": "10:00"}, {"question_id": 38712, "answer": "navy blue"}, {"question_id": 38715, "answer": "borden"}, {"question_id": 38718, "answer": "mexico"}, {"question_id": 38721, "answer": "hoff"}, {"question_id": 38724, "answer": "1"}, {"question_id": 38727, "answer": "zumiez"}, {"question_id": 38730, "answer": "diane law"}, {"question_id": 38733, "answer": "poddymore"}, {"question_id": 38736, "answer": "m"}, {"question_id": 38739, "answer": "brian wadleigh"}, {"question_id": 38742, "answer": "messercenter"}, {"question_id": 38745, "answer": "jvc"}, {"question_id": 38748, "answer": "12:00"}, {"question_id": 38751, "answer": "yes"}, {"question_id": 38754, "answer": "101"}, {"question_id": 38757, "answer": "red"}, {"question_id": 38760, "answer": "100"}, {"question_id": 38763, "answer": "$10"}, {"question_id": 38766, "answer": "sugar"}, {"question_id": 38769, "answer": "my make up nintendo ds"}, {"question_id": 38772, "answer": "yes"}, {"question_id": 38775, "answer": "no events"}, {"question_id": 38778, "answer": "subway"}, {"question_id": 38781, "answer": "10"}, {"question_id": 38784, "answer": "2005"}, {"question_id": 38787, "answer": "gi"}, {"question_id": 38790, "answer": "rolex"}, {"question_id": 38793, "answer": "motorized vehicles"}, {"question_id": 38796, "answer": "11:00 am"}, {"question_id": 38799, "answer": "samsung"}, {"question_id": 38802, "answer": "1954"}, {"question_id": 38805, "answer": "lego"}, {"question_id": 38808, "answer": "woolworths"}, {"question_id": 38811, "answer": "norfolk"}, {"question_id": 38814, "answer": "south africa"}, {"question_id": 38817, "answer": "cubs"}, {"question_id": 38820, "answer": "411"}, {"question_id": 38823, "answer": "samsung"}, {"question_id": 38826, "answer": "old chub"}, {"question_id": 38829, "answer": "htc"}, {"question_id": 38832, "answer": "topco"}, {"question_id": 38835, "answer": "bwin"}, {"question_id": 38838, "answer": "st paul"}, {"question_id": 38841, "answer": "100"}, {"question_id": 38844, "answer": "kvass"}, {"question_id": 38847, "answer": "10:00"}, {"question_id": 38850, "answer": "sense applied"}, {"question_id": 38853, "answer": "school"}, {"question_id": 38856, "answer": "starbucks"}, {"question_id": 38859, "answer": "conference"}, {"question_id": 38862, "answer": "brand"}, {"question_id": 38865, "answer": "london pride"}, {"question_id": 38868, "answer": "coca cola"}, {"question_id": 38871, "answer": "family prime"}, {"question_id": 38874, "answer": "conference room"}, {"question_id": 38877, "answer": "mocha and coffee"}, {"question_id": 38880, "answer": "yes"}, {"question_id": 38883, "answer": "riesling"}, {"question_id": 38886, "answer": "new york"}, {"question_id": 38889, "answer": "excellent"}, {"question_id": 38892, "answer": "tcs"}, {"question_id": 38895, "answer": "oude gezelle vilette"}, {"question_id": 38898, "answer": "mort subite"}, {"question_id": 38901, "answer": "mdv"}, {"question_id": 38904, "answer": "yes"}, {"question_id": 38907, "answer": "no"}, {"question_id": 38910, "answer": "fun"}, {"question_id": 38913, "answer": "samsung"}, {"question_id": 38916, "answer": "coca"}, {"question_id": 38919, "answer": "no"}, {"question_id": 38922, "answer": "cross"}, {"question_id": 38925, "answer": "spanish"}, {"question_id": 38928, "answer": "premium paas 1900"}, {"question_id": 38931, "answer": "yes"}, {"question_id": 38934, "answer": "extra virgin"}, {"question_id": 38937, "answer": "rosie"}, {"question_id": 38940, "answer": "11"}, {"question_id": 38943, "answer": "brew"}, {"question_id": 38946, "answer": "christmas"}, {"question_id": 38949, "answer": "voldemort"}, {"question_id": 38952, "answer": "51"}, {"question_id": 38955, "answer": "sport"}, {"question_id": 38958, "answer": "sprite"}, {"question_id": 38961, "answer": "20"}, {"question_id": 38964, "answer": "nokia"}, {"question_id": 38967, "answer": "wicked witch of west"}, {"question_id": 38970, "answer": "ipod"}, {"question_id": 38973, "answer": "htc"}, {"question_id": 38976, "answer": "city bus"}, {"question_id": 38979, "answer": "bottle"}, {"question_id": 38982, "answer": "levitton"}, {"question_id": 38985, "answer": "bolton"}, {"question_id": 38988, "answer": "k"}, {"question_id": 38991, "answer": "cowellman"}, {"question_id": 38994, "answer": "umocrisigil"}, {"question_id": 38997, "answer": "9800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 39000, "answer": "12:23"}, {"question_id": 39003, "answer": "gilmore"}, {"question_id": 39006, "answer": "1999"}, {"question_id": 39009, "answer": "cc"}, {"question_id": 39012, "answer": "price"}, {"question_id": 39015, "answer": "no"}, {"question_id": 39018, "answer": "jeremy"}, {"question_id": 39021, "answer": "pepsi"}, {"question_id": 39024, "answer": "yes"}, {"question_id": 39027, "answer": "cell phone"}, {"question_id": 39030, "answer": "trains"}, {"question_id": 39033, "answer": "lego"}, {"question_id": 39036, "answer": "tucson"}, {"question_id": 39039, "answer": "ounces cups"}, {"question_id": 39042, "answer": "2009"}, {"question_id": 39045, "answer": "coca cola"}, {"question_id": 39048, "answer": "100"}, {"question_id": 39051, "answer": "lost"}, {"question_id": 39054, "answer": "canada"}, {"question_id": 39057, "answer": "yes"}, {"question_id": 39060, "answer": "lm"}, {"question_id": 39063, "answer": "te"}, {"question_id": 39066, "answer": "california"}, {"question_id": 39069, "answer": "100"}, {"question_id": 39072, "answer": "10"}, {"question_id": 39075, "answer": "lenovo"}, {"question_id": 39078, "answer": "grilled cheese"}, {"question_id": 39081, "answer": "no"}, {"question_id": 39084, "answer": "o and t"}, {"question_id": 39087, "answer": "van"}, {"question_id": 39090, "answer": "b"}, {"question_id": 39093, "answer": "10 15 2015"}, {"question_id": 39096, "answer": "mary higgins clark"}, {"question_id": 39099, "answer": "beer"}, {"question_id": 39102, "answer": "10:00"}, {"question_id": 39105, "answer": "yes"}, {"question_id": 39108, "answer": "100"}, {"question_id": 39111, "answer": "simon"}, {"question_id": 39114, "answer": "build"}, {"question_id": 39117, "answer": "lindsey apodaca"}, {"question_id": 39120, "answer": "hell"}, {"question_id": 39123, "answer": "shiner"}, {"question_id": 39126, "answer": "legode"}, {"question_id": 39129, "answer": "brand"}, {"question_id": 39132, "answer": "husson"}, {"question_id": 39135, "answer": "fox"}, {"question_id": 39138, "answer": "ritz"}, {"question_id": 39141, "answer": "value"}, {"question_id": 39144, "answer": "17"}, {"question_id": 39147, "answer": "jk rowling"}, {"question_id": 39150, "answer": "000"}, {"question_id": 39153, "answer": "trash"}, {"question_id": 39156, "answer": "la"}, {"question_id": 39159, "answer": "djakebees"}, {"question_id": 39162, "answer": "pinterest"}, {"question_id": 39165, "answer": "china"}, {"question_id": 39168, "answer": "jet star imperial"}, {"question_id": 39171, "answer": "10:10"}, {"question_id": 39174, "answer": "candle sticks"}, {"question_id": 39177, "answer": "show"}, {"question_id": 39180, "answer": "coca"}, {"question_id": 39183, "answer": "christmas century"}, {"question_id": 39186, "answer": "antica"}, {"question_id": 39189, "answer": "goslings"}, {"question_id": 39192, "answer": "learning"}, {"question_id": 39195, "answer": "corner"}, {"question_id": 39198, "answer": "12"}, {"question_id": 39201, "answer": "no i'm not going to help it"}, {"question_id": 39204, "answer": "lg"}, {"question_id": 39207, "answer": "beer"}, {"question_id": 39210, "answer": "nokia"}, {"question_id": 39213, "answer": "lips of faith"}, {"question_id": 39216, "answer": "hello"}, {"question_id": 39219, "answer": "car"}, {"question_id": 39222, "answer": "duncan"}, {"question_id": 39225, "answer": "ww"}, {"question_id": 39228, "answer": "bud"}, {"question_id": 39231, "answer": "book"}, {"question_id": 39234, "answer": "jbl"}, {"question_id": 39237, "answer": "e"}, {"question_id": 39240, "answer": "office"}, {"question_id": 39243, "answer": "no"}, {"question_id": 39246, "answer": "tiger"}, {"question_id": 39249, "answer": "u"}, {"question_id": 39252, "answer": "1"}, {"question_id": 39255, "answer": "raiders"}, {"question_id": 39258, "answer": "spanish"}, {"question_id": 39261, "answer": "no parking"}, {"question_id": 39264, "answer": "gamer"}, {"question_id": 39267, "answer": "live"}, {"question_id": 39270, "answer": "10000"}, {"question_id": 39273, "answer": "rutgers"}, {"question_id": 39276, "answer": "loka"}, {"question_id": 39279, "answer": "building"}, {"question_id": 39282, "answer": "pacers"}, {"question_id": 39285, "answer": "nuclear iraq"}, {"question_id": 39288, "answer": "c"}, {"question_id": 39291, "answer": "nike"}, {"question_id": 39294, "answer": "all top pages"}, {"question_id": 39297, "answer": "nigeria"}, {"question_id": 39300, "answer": "15"}, {"question_id": 39303, "answer": "ip"}, {"question_id": 39306, "answer": "adidas"}, {"question_id": 39309, "answer": "cuckoo 's calling"}, {"question_id": 39312, "answer": "army"}, {"question_id": 39315, "answer": "3rd edition"}, {"question_id": 39318, "answer": "10"}, {"question_id": 39321, "answer": "national geographic"}, {"question_id": 39324, "answer": "yes"}, {"question_id": 39327, "answer": "j"}, {"question_id": 39330, "answer": "60 0"}, {"question_id": 39333, "answer": "b"}, {"question_id": 39336, "answer": "nike"}, {"question_id": 39339, "answer": "2010"}, {"question_id": 39342, "answer": "samsung"}, {"question_id": 39345, "answer": "flavor stamp"}, {"question_id": 39348, "answer": "8"}, {"question_id": 39351, "answer": "chargers"}, {"question_id": 39354, "answer": "im"}, {"question_id": 39357, "answer": "important"}, {"question_id": 39360, "answer": "57"}, {"question_id": 39363, "answer": "peddle"}, {"question_id": 39366, "answer": "wmhs"}, {"question_id": 39369, "answer": "$15"}, {"question_id": 39372, "answer": "hump"}, {"question_id": 39375, "answer": "explorers"}, {"question_id": 39378, "answer": "carrton"}, {"question_id": 39381, "answer": "humber"}, {"question_id": 39384, "answer": "8:01"}, {"question_id": 39387, "answer": "f"}, {"question_id": 39390, "answer": "123456"}, {"question_id": 39393, "answer": "james hilton"}, {"question_id": 39396, "answer": "at&t"}, {"question_id": 39399, "answer": "book fair"}, {"question_id": 39402, "answer": "00000000"}, {"question_id": 39405, "answer": "yes"}, {"question_id": 39408, "answer": "conway"}, {"question_id": 39411, "answer": "h"}, {"question_id": 39414, "answer": "fallopia"}, {"question_id": 39417, "answer": "lego"}, {"question_id": 39420, "answer": "clips"}, {"question_id": 39423, "answer": "in truth"}, {"question_id": 39426, "answer": "next door"}, {"question_id": 39429, "answer": "mo"}, {"question_id": 39432, "answer": "100"}, {"question_id": 39435, "answer": "33"}, {"question_id": 39438, "answer": "tacos"}, {"question_id": 39441, "answer": "100"}, {"question_id": 39444, "answer": "astronaut"}, {"question_id": 39447, "answer": "computer cases"}, {"question_id": 39450, "answer": "toronto"}, {"question_id": 39453, "answer": "zeron heroes"}, {"question_id": 39456, "answer": "s"}, {"question_id": 39459, "answer": "atlantic"}, {"question_id": 39462, "answer": "song"}, {"question_id": 39465, "answer": "pyrex"}, {"question_id": 39468, "answer": "10:10"}, {"question_id": 39471, "answer": "8"}, {"question_id": 39474, "answer": "1999"}, {"question_id": 39477, "answer": "jennifer"}, {"question_id": 39480, "answer": "golda & dulce"}, {"question_id": 39483, "answer": "17"}, {"question_id": 39486, "answer": "200"}, {"question_id": 39489, "answer": "worlds of theodore sturgeon"}, {"question_id": 39492, "answer": "ravenswood avenue chicago illinois"}, {"question_id": 39495, "answer": "330"}, {"question_id": 39498, "answer": "pepsi"}, {"question_id": 39501, "answer": "blue cd"}, {"question_id": 39504, "answer": "alaska"}, {"question_id": 39507, "answer": "coca cola"}, {"question_id": 39510, "answer": "energy glut: politics of fatness in overconsumpting world"}, {"question_id": 39513, "answer": "t"}, {"question_id": 39516, "answer": "10:00"}, {"question_id": 39519, "answer": "sean williams"}, {"question_id": 39522, "answer": "pepsi"}, {"question_id": 39525, "answer": "3"}, {"question_id": 39528, "answer": "2010"}, {"question_id": 39531, "answer": "5"}, {"question_id": 39534, "answer": "visa"}, {"question_id": 39537, "answer": "ticketmaster"}, {"question_id": 39540, "answer": "b"}, {"question_id": 39543, "answer": "dragon"}, {"question_id": 39546, "answer": "blue book"}, {"question_id": 39549, "answer": "60"}, {"question_id": 39552, "answer": "yes"}, {"question_id": 39555, "answer": "v"}, {"question_id": 39558, "answer": "ww"}, {"question_id": 39561, "answer": "malt"}, {"question_id": 39564, "answer": "budweiser"}, {"question_id": 39567, "answer": "no trespassing"}, {"question_id": 39570, "answer": "crazy"}, {"question_id": 39573, "answer": "luu b"}, {"question_id": 39576, "answer": "jimmy"}, {"question_id": 39579, "answer": "0"}, {"question_id": 39582, "answer": "cisco"}, {"question_id": 39585, "answer": "building"}, {"question_id": 39588, "answer": "yes"}, {"question_id": 39591, "answer": "cubs"}, {"question_id": 39594, "answer": "15"}, {"question_id": 39597, "answer": "box"}, {"question_id": 39600, "answer": "9"}, {"question_id": 34603, "answer": "drupal"}, {"question_id": 34606, "answer": "10"}, {"question_id": 34609, "answer": "rolex"}, {"question_id": 34612, "answer": "off"}, {"question_id": 34615, "answer": "$1.99"}, {"question_id": 34618, "answer": "johnnie"}, {"question_id": 34621, "answer": "10"}, {"question_id": 34624, "answer": "macbook"}, {"question_id": 34627, "answer": "2012"}, {"question_id": 34630, "answer": "harry"}, {"question_id": 34633, "answer": "yes"}, {"question_id": 34636, "answer": "nedden"}, {"question_id": 34639, "answer": "jesus"}, {"question_id": 34642, "answer": "10000"}, {"question_id": 34645, "answer": "vodka"}, {"question_id": 34648, "answer": "1 pound"}, {"question_id": 34651, "answer": "california"}, {"question_id": 34654, "answer": "british"}, {"question_id": 34657, "answer": "life cycle"}, {"question_id": 34660, "answer": "235"}, {"question_id": 34663, "answer": "corona"}, {"question_id": 34666, "answer": "white"}, {"question_id": 34669, "answer": "food"}, {"question_id": 34672, "answer": "norway"}, {"question_id": 34675, "answer": "korean"}, {"question_id": 34678, "answer": "yes"}, {"question_id": 34681, "answer": "smashed pumpkin ale"}, {"question_id": 34684, "answer": "cathy williams"}, {"question_id": 34687, "answer": "kraeumerwustz"}, {"question_id": 34690, "answer": "baby frog"}, {"question_id": 34693, "answer": "painted veil"}, {"question_id": 34696, "answer": "648 80"}, {"question_id": 34699, "answer": "norman samelson"}, {"question_id": 34702, "answer": "0123456789"}, {"question_id": 34705, "answer": "2011"}, {"question_id": 34708, "answer": "best cellar"}, {"question_id": 34711, "answer": "just mobile phone"}, {"question_id": 34714, "answer": "10"}, {"question_id": 34717, "answer": "r\u00e9volution"}, {"question_id": 34720, "answer": "brahms"}, {"question_id": 34723, "answer": "star"}, {"question_id": 34726, "answer": "gallery"}, {"question_id": 34729, "answer": "smoothies"}, {"question_id": 34732, "answer": "721"}, {"question_id": 34735, "answer": "wwwbbccouk"}, {"question_id": 34738, "answer": "design"}, {"question_id": 34741, "answer": "remington"}, {"question_id": 34744, "answer": "10:00"}, {"question_id": 34747, "answer": "army"}, {"question_id": 34750, "answer": "pulse width modulation"}, {"question_id": 34753, "answer": "5"}, {"question_id": 34756, "answer": "pdx"}, {"question_id": 34759, "answer": "classical"}, {"question_id": 34762, "answer": "yes"}, {"question_id": 34765, "answer": "yes"}, {"question_id": 34768, "answer": "code veronica"}, {"question_id": 34771, "answer": "samsung"}, {"question_id": 34774, "answer": "redhook"}, {"question_id": 34777, "answer": "lanc\u00f4me"}, {"question_id": 34780, "answer": "zinzino"}, {"question_id": 34783, "answer": "bottle"}, {"question_id": 34786, "answer": "taipan"}, {"question_id": 34789, "answer": "yes"}, {"question_id": 34792, "answer": "12"}, {"question_id": 34795, "answer": "apele"}, {"question_id": 34798, "answer": "f11272"}, {"question_id": 34801, "answer": "france"}, {"question_id": 34804, "answer": "emirates"}, {"question_id": 34807, "answer": "vfws"}, {"question_id": 34810, "answer": "flour"}, {"question_id": 34813, "answer": "sony"}, {"question_id": 34816, "answer": "schin"}, {"question_id": 34819, "answer": "10:10"}, {"question_id": 34822, "answer": "9:00"}, {"question_id": 34825, "answer": "packers"}, {"question_id": 34828, "answer": "3"}, {"question_id": 34831, "answer": "yes"}, {"question_id": 34834, "answer": "11:43"}, {"question_id": 34837, "answer": "viropus"}, {"question_id": 34840, "answer": "007"}, {"question_id": 34843, "answer": "12"}, {"question_id": 34846, "answer": "1"}, {"question_id": 34849, "answer": "100"}, {"question_id": 34852, "answer": "malt"}, {"question_id": 34855, "answer": "stirling"}, {"question_id": 34858, "answer": "01 01 2012"}, {"question_id": 34861, "answer": "1.5"}, {"question_id": 34864, "answer": "carnegie hall"}, {"question_id": 34867, "answer": "yes"}, {"question_id": 34870, "answer": "fox"}, {"question_id": 34873, "answer": "dealing in desire: asian accomplices western desires and hidden currencies of global work"}, {"question_id": 34876, "answer": "1 oz"}, {"question_id": 34879, "answer": "cheese"}, {"question_id": 34882, "answer": "office"}, {"question_id": 34885, "answer": "gospel"}, {"question_id": 34888, "answer": "sign"}, {"question_id": 34891, "answer": "1234567890"}, {"question_id": 34894, "answer": "no"}, {"question_id": 34897, "answer": "titanfall"}, {"question_id": 34900, "answer": "10:00"}, {"question_id": 34903, "answer": "yes"}, {"question_id": 34906, "answer": "no"}, {"question_id": 34909, "answer": "oil"}, {"question_id": 34912, "answer": "wine"}, {"question_id": 34915, "answer": "100"}, {"question_id": 34918, "answer": "olivetti"}, {"question_id": 34921, "answer": "secret"}, {"question_id": 34924, "answer": "wonton"}, {"question_id": 34927, "answer": "60"}, {"question_id": 34930, "answer": "old fashioned"}, {"question_id": 34933, "answer": "right"}, {"question_id": 34936, "answer": "tonino maccioni"}, {"question_id": 34939, "answer": "allure"}, {"question_id": 34942, "answer": "2013"}, {"question_id": 34945, "answer": "meijer"}, {"question_id": 34948, "answer": "hbaa"}, {"question_id": 34951, "answer": "friends"}, {"question_id": 34954, "answer": "10000"}, {"question_id": 34957, "answer": "5"}, {"question_id": 34960, "answer": "100"}, {"question_id": 34963, "answer": "windows"}, {"question_id": 34966, "answer": "healthy herbal"}, {"question_id": 34969, "answer": "energy"}, {"question_id": 34972, "answer": "global warming"}, {"question_id": 34975, "answer": "ocean"}, {"question_id": 34978, "answer": "monteith 's"}, {"question_id": 34981, "answer": "cave of thousand tales"}, {"question_id": 34984, "answer": "romulus"}, {"question_id": 34987, "answer": "296"}, {"question_id": 34990, "answer": "vincent"}, {"question_id": 34993, "answer": "70"}, {"question_id": 34996, "answer": "1991"}, {"question_id": 34999, "answer": "wildcats"}, {"question_id": 35002, "answer": "macys"}, {"question_id": 35005, "answer": "1234"}, {"question_id": 35008, "answer": "smart mart"}, {"question_id": 35011, "answer": "premier cru"}, {"question_id": 35014, "answer": "2012"}, {"question_id": 35017, "answer": "laptop"}, {"question_id": 35020, "answer": "white"}, {"question_id": 35023, "answer": "yes"}, {"question_id": 35026, "answer": "sheppey 's"}, {"question_id": 35029, "answer": "700"}, {"question_id": 35032, "answer": "bacardi"}, {"question_id": 35035, "answer": "yes"}, {"question_id": 35038, "answer": "captain morgan"}, {"question_id": 35041, "answer": "deep water"}, {"question_id": 35044, "answer": "beatles"}, {"question_id": 35047, "answer": "big emperor"}, {"question_id": 35050, "answer": "f9"}, {"question_id": 35053, "answer": "book"}, {"question_id": 35056, "answer": "53"}, {"question_id": 35059, "answer": "66"}, {"question_id": 35062, "answer": "power mancer"}, {"question_id": 35065, "answer": "10:10"}, {"question_id": 35068, "answer": "fajita revolution"}, {"question_id": 35071, "answer": "office"}, {"question_id": 35074, "answer": "fail"}, {"question_id": 35077, "answer": "shirt"}, {"question_id": 35080, "answer": "chateau d'orbois"}, {"question_id": 35083, "answer": "beats"}, {"question_id": 35086, "answer": "10"}, {"question_id": 35089, "answer": "red"}, {"question_id": 35092, "answer": "h"}, {"question_id": 35095, "answer": "you tube"}, {"question_id": 35098, "answer": "coco"}, {"question_id": 35101, "answer": "10"}, {"question_id": 35104, "answer": "stevie ray vaughan"}, {"question_id": 35107, "answer": "battle"}, {"question_id": 35110, "answer": "steve"}, {"question_id": 35113, "answer": "12:00"}, {"question_id": 35116, "answer": "nissan"}, {"question_id": 35119, "answer": "refrigerator"}, {"question_id": 35122, "answer": "puffin festival"}, {"question_id": 35125, "answer": "soda"}, {"question_id": 35128, "answer": "beer"}, {"question_id": 35131, "answer": "drink bar"}, {"question_id": 35134, "answer": "soda"}, {"question_id": 35137, "answer": "$10.00"}, {"question_id": 35140, "answer": "10"}, {"question_id": 35143, "answer": "yves saint laurent"}, {"question_id": 35146, "answer": "weaver"}, {"question_id": 35149, "answer": "yes"}, {"question_id": 35152, "answer": "espresso"}, {"question_id": 35155, "answer": "dg"}, {"question_id": 35158, "answer": "stop"}, {"question_id": 35161, "answer": "$100"}, {"question_id": 35164, "answer": "baby"}, {"question_id": 35167, "answer": "justice for leviv"}, {"question_id": 35170, "answer": "olive"}, {"question_id": 35173, "answer": "firefighting"}, {"question_id": 35176, "answer": "railhawks"}, {"question_id": 35179, "answer": "1870"}, {"question_id": 35182, "answer": "samsung"}, {"question_id": 35185, "answer": "10:30"}, {"question_id": 35188, "answer": "lg"}, {"question_id": 35191, "answer": "8 oz"}, {"question_id": 35194, "answer": "apple"}, {"question_id": 35197, "answer": "ron"}, {"question_id": 35200, "answer": "picture"}, {"question_id": 35203, "answer": "nutter"}, {"question_id": 35206, "answer": "montreal"}, {"question_id": 35209, "answer": "whiteboard"}, {"question_id": 35212, "answer": "megan gardiner"}, {"question_id": 35215, "answer": "dodgers"}, {"question_id": 35218, "answer": "2"}, {"question_id": 35221, "answer": "canon"}, {"question_id": 35224, "answer": "black"}, {"question_id": 35227, "answer": "funeral"}, {"question_id": 35230, "answer": "samsung"}, {"question_id": 35233, "answer": "reading"}, {"question_id": 35236, "answer": "london"}, {"question_id": 35239, "answer": "maotong and i were beggars"}, {"question_id": 35242, "answer": "1"}, {"question_id": 35245, "answer": "yes"}, {"question_id": 35248, "answer": "illustrator"}, {"question_id": 35251, "answer": "paul"}, {"question_id": 35254, "answer": "stop"}, {"question_id": 35257, "answer": "1999"}, {"question_id": 35260, "answer": "80"}, {"question_id": 35263, "answer": "yes"}, {"question_id": 35266, "answer": "yes"}, {"question_id": 35269, "answer": "ben 's"}, {"question_id": 35272, "answer": "10:10"}, {"question_id": 35275, "answer": ""}, {"question_id": 35278, "answer": "thug"}, {"question_id": 35281, "answer": "35"}, {"question_id": 35284, "answer": "yes"}, {"question_id": 35287, "answer": "odell"}, {"question_id": 35290, "answer": "maine"}, {"question_id": 35293, "answer": "black"}, {"question_id": 35296, "answer": "coffee"}, {"question_id": 35299, "answer": "no"}, {"question_id": 35302, "answer": "arizona"}, {"question_id": 35305, "answer": "10:05"}, {"question_id": 35308, "answer": "papertape"}, {"question_id": 35311, "answer": "no smoking"}, {"question_id": 35314, "answer": "castacon"}, {"question_id": 35317, "answer": "no"}, {"question_id": 35320, "answer": "samsung"}, {"question_id": 35323, "answer": "gshock"}, {"question_id": 35326, "answer": "ipa"}, {"question_id": 35329, "answer": "street"}, {"question_id": 35332, "answer": "2012"}, {"question_id": 35335, "answer": "pepsi"}, {"question_id": 35338, "answer": "party"}, {"question_id": 35341, "answer": "rolex"}, {"question_id": 35344, "answer": "new york"}, {"question_id": 35347, "answer": "hard drive"}, {"question_id": 35350, "answer": "yes"}, {"question_id": 35353, "answer": "apple"}, {"question_id": 35356, "answer": "yes"}, {"question_id": 35359, "answer": "yes"}, {"question_id": 35362, "answer": "e"}, {"question_id": 35365, "answer": "el salvador"}, {"question_id": 35368, "answer": "st peter 's"}, {"question_id": 35371, "answer": "bin"}, {"question_id": 35374, "answer": "children"}, {"question_id": 35377, "answer": "flexible connector"}, {"question_id": 35380, "answer": "i mac"}, {"question_id": 35383, "answer": "fire"}, {"question_id": 35386, "answer": "hello"}, {"question_id": 35389, "answer": "massif"}, {"question_id": 35392, "answer": "5"}, {"question_id": 35395, "answer": "5%"}, {"question_id": 35398, "answer": "chardonnay"}, {"question_id": 35401, "answer": "yes"}, {"question_id": 35404, "answer": "google"}, {"question_id": 35407, "answer": "14"}, {"question_id": 35410, "answer": "2012"}, {"question_id": 35413, "answer": "clear care"}, {"question_id": 35416, "answer": "early"}, {"question_id": 35419, "answer": "police"}, {"question_id": 35422, "answer": "magazines"}, {"question_id": 35425, "answer": "spm"}, {"question_id": 35428, "answer": "ww"}, {"question_id": 35431, "answer": "1971"}, {"question_id": 35434, "answer": "c"}, {"question_id": 35437, "answer": "you"}, {"question_id": 35440, "answer": "elginiano"}, {"question_id": 35443, "answer": "seema st"}, {"question_id": 35446, "answer": "10:37"}, {"question_id": 35449, "answer": "2011"}, {"question_id": 35452, "answer": "100"}, {"question_id": 35455, "answer": "royal oak"}, {"question_id": 35458, "answer": "city"}, {"question_id": 35461, "answer": "beer"}, {"question_id": 35464, "answer": "no"}, {"question_id": 35467, "answer": "august"}, {"question_id": 35470, "answer": "government"}, {"question_id": 35473, "answer": "37"}, {"question_id": 35476, "answer": "mark zusak"}, {"question_id": 35479, "answer": "kristin"}, {"question_id": 35482, "answer": "cristal"}, {"question_id": 35485, "answer": "cold mountain"}, {"question_id": 35488, "answer": "0"}, {"question_id": 35491, "answer": "100"}, {"question_id": 35494, "answer": "90"}, {"question_id": 35497, "answer": "taxi"}, {"question_id": 35500, "answer": "michael conrad"}, {"question_id": 35503, "answer": "6197"}, {"question_id": 35506, "answer": "12"}, {"question_id": 35509, "answer": "wmhs"}, {"question_id": 35512, "answer": "installation and learning"}, {"question_id": 35515, "answer": "you're 10"}, {"question_id": 35518, "answer": "bookstore"}, {"question_id": 35521, "answer": "11"}, {"question_id": 35524, "answer": "tennessee"}, {"question_id": 35527, "answer": "united"}, {"question_id": 35530, "answer": "conversation"}, {"question_id": 35533, "answer": "80"}, {"question_id": 35536, "answer": "beer"}, {"question_id": 35539, "answer": "la fimonde"}, {"question_id": 35542, "answer": "science"}, {"question_id": 35545, "answer": "people"}, {"question_id": 35548, "answer": "100"}, {"question_id": 35551, "answer": "label"}, {"question_id": 35554, "answer": "amazing"}, {"question_id": 35557, "answer": "10:10"}, {"question_id": 35560, "answer": "7 11"}, {"question_id": 35563, "answer": "3:00"}, {"question_id": 35566, "answer": "heart rate"}, {"question_id": 35569, "answer": "macdonald 's"}, {"question_id": 35572, "answer": "china"}, {"question_id": 35575, "answer": "beer"}, {"question_id": 35578, "answer": "british"}, {"question_id": 35581, "answer": "sony"}, {"question_id": 35584, "answer": "11"}, {"question_id": 35587, "answer": "valerie"}, {"question_id": 35590, "answer": "16"}, {"question_id": 35593, "answer": "toys"}, {"question_id": 35596, "answer": "bud"}, {"question_id": 35599, "answer": "black and decker"}, {"question_id": 35602, "answer": "rick poynor"}, {"question_id": 35605, "answer": "3"}, {"question_id": 35608, "answer": "new york"}, {"question_id": 35611, "answer": "bus stop"}, {"question_id": 35614, "answer": "nike"}, {"question_id": 35617, "answer": "cocraft"}, {"question_id": 35620, "answer": "2007"}, {"question_id": 35623, "answer": "trojans"}, {"question_id": 35626, "answer": "10"}, {"question_id": 35629, "answer": "10"}, {"question_id": 35632, "answer": "city market"}, {"question_id": 35635, "answer": "branding"}, {"question_id": 35638, "answer": "100"}, {"question_id": 35641, "answer": "mrs alia"}, {"question_id": 35644, "answer": "chinese"}, {"question_id": 35647, "answer": "yes"}, {"question_id": 35650, "answer": "pepsi"}, {"question_id": 35653, "answer": "rolex"}, {"question_id": 35656, "answer": "no"}, {"question_id": 35659, "answer": "all"}, {"question_id": 35662, "answer": "f"}, {"question_id": 35665, "answer": "wintertide"}, {"question_id": 35668, "answer": "do"}, {"question_id": 35671, "answer": "1970 1980"}, {"question_id": 35674, "answer": "10"}, {"question_id": 35677, "answer": "store"}, {"question_id": 35680, "answer": "101"}, {"question_id": 35683, "answer": "hudson memorial stadium"}, {"question_id": 35686, "answer": "sunny"}, {"question_id": 35689, "answer": "poland"}, {"question_id": 35692, "answer": "airbus"}, {"question_id": 35695, "answer": "6"}, {"question_id": 35698, "answer": "800 555 1212"}, {"question_id": 35701, "answer": "brian k vaughan"}, {"question_id": 35704, "answer": "21"}, {"question_id": 35707, "answer": "and all stars stage"}, {"question_id": 35710, "answer": "1234"}, {"question_id": 35713, "answer": "tigers"}, {"question_id": 35716, "answer": "stop"}, {"question_id": 35719, "answer": "saturday"}, {"question_id": 35722, "answer": "cleveland"}, {"question_id": 35725, "answer": "shu uemura"}, {"question_id": 35728, "answer": "2011"}, {"question_id": 35731, "answer": "dell"}, {"question_id": 35734, "answer": "64"}, {"question_id": 35737, "answer": "orioles"}, {"question_id": 35740, "answer": "12"}, {"question_id": 35743, "answer": "sol"}, {"question_id": 35746, "answer": "toshiba"}, {"question_id": 35749, "answer": "menu"}, {"question_id": 35752, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 35755, "answer": "776"}, {"question_id": 35758, "answer": "12"}, {"question_id": 35761, "answer": "blue"}, {"question_id": 35764, "answer": "4"}, {"question_id": 35767, "answer": "2012"}, {"question_id": 35770, "answer": "dark tower"}, {"question_id": 35773, "answer": "nike"}, {"question_id": 35776, "answer": "main street"}, {"question_id": 35779, "answer": "12"}, {"question_id": 35782, "answer": "you are young man"}, {"question_id": 35785, "answer": "23"}, {"question_id": 35788, "answer": "wet n wild"}, {"question_id": 35791, "answer": "jordan"}, {"question_id": 35794, "answer": "2"}, {"question_id": 35797, "answer": "library system"}, {"question_id": 35800, "answer": "100"}, {"question_id": 35803, "answer": "airplane"}, {"question_id": 35806, "answer": "melissa keel"}, {"question_id": 35809, "answer": "1900"}, {"question_id": 35812, "answer": "yes"}, {"question_id": 35815, "answer": "jingutai"}, {"question_id": 35818, "answer": "10:00"}, {"question_id": 35821, "answer": "small"}, {"question_id": 35824, "answer": "10:10"}, {"question_id": 35827, "answer": "crown"}, {"question_id": 35830, "answer": "lg"}, {"question_id": 35833, "answer": "mountain dew"}, {"question_id": 35836, "answer": "mets"}, {"question_id": 35839, "answer": "keurig"}, {"question_id": 35842, "answer": "toronto"}, {"question_id": 35845, "answer": "army"}, {"question_id": 35848, "answer": "united states"}, {"question_id": 35851, "answer": "cuv"}, {"question_id": 35854, "answer": "warm"}, {"question_id": 35857, "answer": "rodada"}, {"question_id": 35860, "answer": "beach"}, {"question_id": 35863, "answer": "airplane"}, {"question_id": 35866, "answer": "bud"}, {"question_id": 35869, "answer": "suntory"}, {"question_id": 35872, "answer": "5"}, {"question_id": 35875, "answer": "23"}, {"question_id": 35878, "answer": "canada"}, {"question_id": 35881, "answer": "cereal"}, {"question_id": 35884, "answer": "yellow team"}, {"question_id": 35887, "answer": "yes"}, {"question_id": 35890, "answer": "lufthansa"}, {"question_id": 35893, "answer": "train"}, {"question_id": 35896, "answer": "19"}, {"question_id": 35899, "answer": "dornfelder"}, {"question_id": 35902, "answer": "georgia"}, {"question_id": 35905, "answer": "pelican island"}, {"question_id": 35908, "answer": "10:30"}, {"question_id": 35911, "answer": "5"}, {"question_id": 35914, "answer": "chateau"}, {"question_id": 35917, "answer": "coffee"}, {"question_id": 35920, "answer": "old gold"}, {"question_id": 35923, "answer": "enter"}, {"question_id": 35926, "answer": "map"}, {"question_id": 35929, "answer": "lemon"}, {"question_id": 35932, "answer": "chapter 10"}, {"question_id": 35935, "answer": "intersporte"}, {"question_id": 35938, "answer": "11"}, {"question_id": 35941, "answer": "nanometer"}, {"question_id": 35944, "answer": "number"}, {"question_id": 35947, "answer": "ai"}, {"question_id": 35950, "answer": "triathlon"}, {"question_id": 35953, "answer": "chinese"}, {"question_id": 35956, "answer": "grovn"}, {"question_id": 35959, "answer": "b"}, {"question_id": 35962, "answer": "cookery school"}, {"question_id": 35965, "answer": "electricity"}, {"question_id": 35968, "answer": "map"}, {"question_id": 35971, "answer": "sex"}, {"question_id": 35974, "answer": "rubbermaid"}, {"question_id": 35977, "answer": "stop"}, {"question_id": 35980, "answer": "no plate"}, {"question_id": 35983, "answer": "black draught"}, {"question_id": 35986, "answer": "mac os"}, {"question_id": 35989, "answer": "kappa"}, {"question_id": 35992, "answer": "y"}, {"question_id": 35995, "answer": "lorina"}, {"question_id": 35998, "answer": "wvg"}, {"question_id": 36001, "answer": "no instructions"}, {"question_id": 36004, "answer": "eversharp"}, {"question_id": 36007, "answer": "night"}, {"question_id": 36010, "answer": "nintendo"}, {"question_id": 36013, "answer": "nokia"}, {"question_id": 36016, "answer": "build"}, {"question_id": 36019, "answer": "yes"}, {"question_id": 36022, "answer": "polar"}, {"question_id": 36025, "answer": "no"}, {"question_id": 36028, "answer": "wicker man"}, {"question_id": 36031, "answer": "john ruggiero"}, {"question_id": 36034, "answer": "virginia"}, {"question_id": 36037, "answer": "i dnt"}, {"question_id": 36040, "answer": "50"}, {"question_id": 36043, "answer": "100"}, {"question_id": 36046, "answer": "no"}, {"question_id": 36049, "answer": "black"}, {"question_id": 36052, "answer": "yes"}, {"question_id": 36055, "answer": "44"}, {"question_id": 36058, "answer": "hewlett packard"}, {"question_id": 36061, "answer": "united"}, {"question_id": 36064, "answer": "pass"}, {"question_id": 36067, "answer": "lemon"}, {"question_id": 36070, "answer": "10"}, {"question_id": 36073, "answer": "taste"}, {"question_id": 36076, "answer": "orion"}, {"question_id": 36079, "answer": "guinness"}, {"question_id": 36082, "answer": "10"}, {"question_id": 36085, "answer": "thank you"}, {"question_id": 36088, "answer": "united kingdom"}, {"question_id": 36091, "answer": "usa"}, {"question_id": 36094, "answer": "sony"}, {"question_id": 36097, "answer": "new jersey"}, {"question_id": 36100, "answer": "juroy"}, {"question_id": 36103, "answer": "whole wheat"}, {"question_id": 36106, "answer": "revista trinidad"}, {"question_id": 36109, "answer": "10 minutes"}, {"question_id": 36112, "answer": "brandebourg"}, {"question_id": 36115, "answer": "cola"}, {"question_id": 36118, "answer": "kenya loses 6 million trees daily"}, {"question_id": 36121, "answer": "9"}, {"question_id": 36124, "answer": "h"}, {"question_id": 36127, "answer": "interstate 97"}, {"question_id": 36130, "answer": "400"}, {"question_id": 36133, "answer": "campbell city"}, {"question_id": 36136, "answer": "rolex"}, {"question_id": 36139, "answer": "rolex"}, {"question_id": 36142, "answer": "1"}, {"question_id": 36145, "answer": "10"}, {"question_id": 36148, "answer": "god"}, {"question_id": 36151, "answer": "10"}, {"question_id": 36154, "answer": "box"}, {"question_id": 36157, "answer": "100"}, {"question_id": 36160, "answer": "r"}, {"question_id": 36163, "answer": "duphaston"}, {"question_id": 36166, "answer": "wisconsin court"}, {"question_id": 36169, "answer": "georges de latour"}, {"question_id": 36172, "answer": "duck"}, {"question_id": 36175, "answer": "orange"}, {"question_id": 36178, "answer": "bbm"}, {"question_id": 36181, "answer": "54"}, {"question_id": 36184, "answer": "ra dickey"}, {"question_id": 36187, "answer": "improves health"}, {"question_id": 36190, "answer": "2011"}, {"question_id": 36193, "answer": "eleven"}, {"question_id": 36196, "answer": "b"}, {"question_id": 36199, "answer": "o"}, {"question_id": 36202, "answer": "performance"}, {"question_id": 36205, "answer": "fallout"}, {"question_id": 36208, "answer": "27"}, {"question_id": 36211, "answer": "windows"}, {"question_id": 36214, "answer": "44"}, {"question_id": 36217, "answer": "10:00"}, {"question_id": 36220, "answer": "$10.00"}, {"question_id": 36223, "answer": "snickers"}, {"question_id": 36226, "answer": "news"}, {"question_id": 36229, "answer": "phillip loope"}, {"question_id": 36232, "answer": "christians"}, {"question_id": 36235, "answer": "virginia"}, {"question_id": 36238, "answer": "right book"}, {"question_id": 36241, "answer": "june"}, {"question_id": 36244, "answer": "yes"}, {"question_id": 36247, "answer": "woman"}, {"question_id": 36250, "answer": "vixton"}, {"question_id": 36253, "answer": "$1"}, {"question_id": 36256, "answer": "349.99"}, {"question_id": 36259, "answer": "edge"}, {"question_id": 36262, "answer": "10"}, {"question_id": 36265, "answer": "samsung"}, {"question_id": 36268, "answer": "heart health"}, {"question_id": 36271, "answer": "via"}, {"question_id": 36274, "answer": "10"}, {"question_id": 36277, "answer": "corona"}, {"question_id": 36280, "answer": "no service"}, {"question_id": 36283, "answer": "emirates"}, {"question_id": 36286, "answer": "50"}, {"question_id": 36289, "answer": "iphone"}, {"question_id": 36292, "answer": "34"}, {"question_id": 36295, "answer": "seiko"}, {"question_id": 36298, "answer": "no"}, {"question_id": 36301, "answer": "ebel"}, {"question_id": 36304, "answer": "22"}, {"question_id": 36307, "answer": "title"}, {"question_id": 36310, "answer": "1"}, {"question_id": 36313, "answer": "aut"}, {"question_id": 36316, "answer": "orange"}, {"question_id": 36319, "answer": "air pacific"}, {"question_id": 36322, "answer": "edwards"}, {"question_id": 36325, "answer": "yes"}, {"question_id": 36328, "answer": "lovequotesnet"}, {"question_id": 36331, "answer": "fire"}, {"question_id": 36334, "answer": "30"}, {"question_id": 36337, "answer": "katarri"}, {"question_id": 36340, "answer": "yes"}, {"question_id": 36343, "answer": "lissa debernadis band"}, {"question_id": 36346, "answer": "limonene"}, {"question_id": 36349, "answer": "ultra ocatra 030v"}, {"question_id": 36352, "answer": "taxi"}, {"question_id": 36355, "answer": "google"}, {"question_id": 36358, "answer": "you"}, {"question_id": 36361, "answer": "fedex"}, {"question_id": 36364, "answer": "nike"}, {"question_id": 36367, "answer": "l"}, {"question_id": 36370, "answer": "e3 dead or alive e3"}, {"question_id": 36373, "answer": "3"}, {"question_id": 36376, "answer": "xii"}, {"question_id": 36379, "answer": "chambord"}, {"question_id": 36382, "answer": "bulldogs"}, {"question_id": 36385, "answer": "101"}, {"question_id": 36388, "answer": "samsung"}, {"question_id": 36391, "answer": "10:30"}, {"question_id": 36394, "answer": "1890"}, {"question_id": 36397, "answer": "chamberlin"}, {"question_id": 36400, "answer": "10"}, {"question_id": 36403, "answer": "british royalty"}, {"question_id": 36406, "answer": "seattle"}, {"question_id": 36409, "answer": "energy drinks"}, {"question_id": 36412, "answer": "no"}, {"question_id": 36415, "answer": "13"}, {"question_id": 36418, "answer": "2010"}, {"question_id": 36421, "answer": "california"}, {"question_id": 36424, "answer": "lenovo"}, {"question_id": 36427, "answer": "since 1920"}, {"question_id": 36430, "answer": "see"}, {"question_id": 36433, "answer": "beer"}, {"question_id": 36436, "answer": "hasbro"}, {"question_id": 36439, "answer": "milk"}, {"question_id": 36442, "answer": "winchester"}, {"question_id": 36445, "answer": "camel"}, {"question_id": 36448, "answer": "3"}, {"question_id": 36451, "answer": "chase"}, {"question_id": 36454, "answer": "29"}, {"question_id": 36457, "answer": "computer"}, {"question_id": 36460, "answer": "say"}, {"question_id": 36463, "answer": "newtown 's 150th anniversary celebration"}, {"question_id": 36466, "answer": "eminem"}, {"question_id": 36469, "answer": "tallman"}, {"question_id": 36472, "answer": "field"}, {"question_id": 36475, "answer": "blue label"}, {"question_id": 36478, "answer": "man"}, {"question_id": 36481, "answer": "soap"}, {"question_id": 36484, "answer": "poem"}, {"question_id": 36487, "answer": "calculus"}, {"question_id": 36490, "answer": "yes"}, {"question_id": 36493, "answer": "ocean"}, {"question_id": 36496, "answer": "10 minutes"}, {"question_id": 36499, "answer": "right"}, {"question_id": 36502, "answer": "7"}, {"question_id": 36505, "answer": "yes"}, {"question_id": 36508, "answer": "arrow"}, {"question_id": 36511, "answer": "blue jays"}, {"question_id": 36514, "answer": "graciano"}, {"question_id": 36517, "answer": "10 inches"}, {"question_id": 36520, "answer": "jefferson"}, {"question_id": 36523, "answer": "new york"}, {"question_id": 36526, "answer": "1900"}, {"question_id": 36529, "answer": "yes"}, {"question_id": 36532, "answer": "10:10"}, {"question_id": 36535, "answer": "stout"}, {"question_id": 36538, "answer": "samsung"}, {"question_id": 36541, "answer": "2004"}, {"question_id": 36544, "answer": "espresso"}, {"question_id": 36547, "answer": "no"}, {"question_id": 36550, "answer": "100"}, {"question_id": 36553, "answer": "07 08 2012"}, {"question_id": 36556, "answer": "instalado"}, {"question_id": 36559, "answer": "gorka"}, {"question_id": 36562, "answer": "miller"}, {"question_id": 36565, "answer": "toshiba"}, {"question_id": 36568, "answer": "12:00"}, {"question_id": 36571, "answer": "jamie"}, {"question_id": 36574, "answer": "jewel tower"}, {"question_id": 36577, "answer": "left"}, {"question_id": 36580, "answer": "100"}, {"question_id": 36583, "answer": "page"}, {"question_id": 36586, "answer": "dodgers"}, {"question_id": 36589, "answer": "sex in every way possible"}, {"question_id": 36592, "answer": "12 fl oz"}, {"question_id": 36595, "answer": "bishop 's finger"}, {"question_id": 36598, "answer": "blue"}, {"question_id": 36601, "answer": "france"}, {"question_id": 36604, "answer": "1234567890"}, {"question_id": 36607, "answer": "100"}, {"question_id": 36610, "answer": "2 and 3"}, {"question_id": 36613, "answer": "ramada"}, {"question_id": 36616, "answer": "honey"}, {"question_id": 36619, "answer": "judo"}, {"question_id": 36622, "answer": "pineapple"}, {"question_id": 36625, "answer": "isaac asimov"}, {"question_id": 36628, "answer": "314159265"}, {"question_id": 36631, "answer": "premium"}, {"question_id": 36634, "answer": "2008"}, {"question_id": 36637, "answer": "tuscon"}, {"question_id": 36640, "answer": "1"}, {"question_id": 36643, "answer": "1"}, {"question_id": 36646, "answer": "10:00 10:08"}, {"question_id": 36649, "answer": "coca"}, {"question_id": 36652, "answer": "roland"}, {"question_id": 36655, "answer": "yes"}, {"question_id": 36658, "answer": "american gothic tales includes stories by edgar allan poe nathaniel hawthorne herman melville and others"}, {"question_id": 36661, "answer": "soda"}, {"question_id": 36664, "answer": "john cheever"}, {"question_id": 36667, "answer": "tournee"}, {"question_id": 36670, "answer": "3"}, {"question_id": 36673, "answer": "cgp"}, {"question_id": 36676, "answer": "small"}, {"question_id": 36679, "answer": "dog"}, {"question_id": 36682, "answer": "yes"}, {"question_id": 36685, "answer": "apple"}, {"question_id": 36688, "answer": "joshua humphreys"}, {"question_id": 36691, "answer": "dell"}, {"question_id": 36694, "answer": "what is right"}, {"question_id": 36697, "answer": "c"}, {"question_id": 36700, "answer": "32"}, {"question_id": 36703, "answer": "space bar"}, {"question_id": 36706, "answer": "sandwiches"}, {"question_id": 36709, "answer": "hamilton"}, {"question_id": 36712, "answer": "800 666 6666"}, {"question_id": 36715, "answer": "steve jobs"}, {"question_id": 36718, "answer": "boxing"}, {"question_id": 36721, "answer": "nats"}, {"question_id": 36724, "answer": "nissan"}, {"question_id": 36727, "answer": "10:10"}, {"question_id": 36730, "answer": "ballonworkscouk"}, {"question_id": 36733, "answer": "android"}, {"question_id": 36736, "answer": ""}, {"question_id": 36739, "answer": "twitter"}, {"question_id": 36742, "answer": "10:30"}, {"question_id": 36745, "answer": "broadway"}, {"question_id": 36748, "answer": "samsung"}, {"question_id": 36751, "answer": "d"}, {"question_id": 36754, "answer": "sun"}, {"question_id": 36757, "answer": "large"}, {"question_id": 36760, "answer": "card stock"}, {"question_id": 36763, "answer": "airbus business center"}, {"question_id": 36766, "answer": "10"}, {"question_id": 36769, "answer": "no"}, {"question_id": 36772, "answer": "london"}, {"question_id": 36775, "answer": "30"}, {"question_id": 36778, "answer": "clase"}, {"question_id": 36781, "answer": "canada"}, {"question_id": 36784, "answer": "5"}, {"question_id": 36787, "answer": "10:10"}, {"question_id": 36790, "answer": "keyboard"}, {"question_id": 36793, "answer": "rachel doctah love"}, {"question_id": 36796, "answer": "dell"}, {"question_id": 36799, "answer": "yes"}, {"question_id": 36802, "answer": "history"}, {"question_id": 36805, "answer": "senza chioderi gocciocchi"}, {"question_id": 36808, "answer": "google"}, {"question_id": 36811, "answer": "blancpain"}, {"question_id": 36814, "answer": "north carolina"}, {"question_id": 36817, "answer": "12:45"}, {"question_id": 36820, "answer": "cristal"}, {"question_id": 36823, "answer": "94043"}, {"question_id": 36826, "answer": "chicago"}, {"question_id": 36829, "answer": "countdown"}, {"question_id": 36832, "answer": "winter"}, {"question_id": 36835, "answer": "search"}, {"question_id": 36838, "answer": "colorado"}, {"question_id": 36841, "answer": "hovercraft"}, {"question_id": 36844, "answer": "2009"}, {"question_id": 36847, "answer": "panther science"}, {"question_id": 36850, "answer": "lakers"}, {"question_id": 36853, "answer": "gary"}, {"question_id": 36856, "answer": "leo herman"}, {"question_id": 36859, "answer": "mekonomen"}, {"question_id": 36862, "answer": "vestor"}, {"question_id": 36865, "answer": "renato orazi arra"}, {"question_id": 36868, "answer": "spider man"}, {"question_id": 36871, "answer": "no"}, {"question_id": 36874, "answer": "100"}, {"question_id": 36877, "answer": "gipuzo"}, {"question_id": 36880, "answer": "100"}, {"question_id": 36883, "answer": "ptchaikovsky symphony no 1 1999"}, {"question_id": 36886, "answer": "cruise ship"}, {"question_id": 36889, "answer": "catherine feller"}, {"question_id": 36892, "answer": "wwwnationalrailcouk"}, {"question_id": 36895, "answer": "guinness"}, {"question_id": 36898, "answer": "webconverge"}, {"question_id": 36901, "answer": "bioabfall"}, {"question_id": 36904, "answer": "12:00"}, {"question_id": 36907, "answer": "old yeller"}, {"question_id": 36910, "answer": "51000000"}, {"question_id": 36913, "answer": "no left turn"}, {"question_id": 36916, "answer": "61"}, {"question_id": 36919, "answer": "cobra"}, {"question_id": 36922, "answer": "herald sun"}, {"question_id": 36925, "answer": "35"}, {"question_id": 36928, "answer": "lake shore"}, {"question_id": 36931, "answer": "andrew"}, {"question_id": 36934, "answer": "1234"}, {"question_id": 36937, "answer": "yes"}, {"question_id": 36940, "answer": "10:10"}, {"question_id": 36943, "answer": "7"}, {"question_id": 36946, "answer": "10:10"}, {"question_id": 36949, "answer": ""}, {"question_id": 36952, "answer": "10"}, {"question_id": 36955, "answer": "yes"}, {"question_id": 36958, "answer": "peas"}, {"question_id": 36961, "answer": "policia civil"}, {"question_id": 36964, "answer": "miss"}, {"question_id": 36967, "answer": "soul"}, {"question_id": 36970, "answer": "concord hill"}, {"question_id": 36973, "answer": "yes"}, {"question_id": 36976, "answer": "yes"}, {"question_id": 36979, "answer": "ladies"}, {"question_id": 36982, "answer": "spring"}, {"question_id": 36985, "answer": "workers"}, {"question_id": 36988, "answer": "kiruna"}, {"question_id": 36991, "answer": "facebook"}, {"question_id": 36994, "answer": "10:30"}, {"question_id": 36997, "answer": "virginia"}, {"question_id": 37000, "answer": "$14.95"}, {"question_id": 37003, "answer": "coca cola"}, {"question_id": 37006, "answer": "8"}, {"question_id": 37009, "answer": "2"}, {"question_id": 37012, "answer": "1234"}, {"question_id": 37015, "answer": "iphone"}, {"question_id": 37018, "answer": "black"}, {"question_id": 37021, "answer": "birrifico"}, {"question_id": 37024, "answer": "no"}, {"question_id": 37027, "answer": "4"}, {"question_id": 37030, "answer": "samsung"}, {"question_id": 37033, "answer": "photo"}, {"question_id": 37036, "answer": "yes"}, {"question_id": 37039, "answer": "red signal"}, {"question_id": 37042, "answer": "fire"}, {"question_id": 37045, "answer": "triangle"}, {"question_id": 37048, "answer": "yes"}, {"question_id": 37051, "answer": "jason graham"}, {"question_id": 37054, "answer": "symphony no 9"}, {"question_id": 37057, "answer": "lucky club"}, {"question_id": 37060, "answer": "department of justice"}, {"question_id": 37063, "answer": "7"}, {"question_id": 37066, "answer": "pacari"}, {"question_id": 37069, "answer": "vodka"}, {"question_id": 37072, "answer": "10 20 2012"}, {"question_id": 37075, "answer": "experience"}, {"question_id": 37078, "answer": "47"}, {"question_id": 37081, "answer": "london"}, {"question_id": 37084, "answer": "l"}, {"question_id": 37087, "answer": "beginning"}, {"question_id": 37090, "answer": "air force"}, {"question_id": 37093, "answer": "coca cola"}, {"question_id": 37096, "answer": "fc"}, {"question_id": 37099, "answer": "telephone"}, {"question_id": 37102, "answer": "yes"}, {"question_id": 37105, "answer": "0 9"}, {"question_id": 37108, "answer": "busy bee"}, {"question_id": 37111, "answer": "worry"}, {"question_id": 37114, "answer": "scansom"}, {"question_id": 37117, "answer": "whiskey"}, {"question_id": 37120, "answer": "1999"}, {"question_id": 37123, "answer": "rhetoric"}, {"question_id": 37126, "answer": "yes"}, {"question_id": 37129, "answer": "chinese"}, {"question_id": 37132, "answer": "b"}, {"question_id": 37135, "answer": "sony"}, {"question_id": 37138, "answer": "intel"}, {"question_id": 37141, "answer": "words in black ink on white paper"}, {"question_id": 37144, "answer": "airphone"}, {"question_id": 37147, "answer": "otter brewing"}, {"question_id": 37150, "answer": "buttons"}, {"question_id": 37153, "answer": "korean"}, {"question_id": 37156, "answer": "wwwplaneteclipsecom"}, {"question_id": 37159, "answer": "sword of dust 2005"}, {"question_id": 37162, "answer": "75"}, {"question_id": 37165, "answer": "asahi"}, {"question_id": 37168, "answer": "skyline"}, {"question_id": 37171, "answer": "el emperador"}, {"question_id": 37174, "answer": "yes"}, {"question_id": 37177, "answer": "wristwatch"}, {"question_id": 37180, "answer": "ikea"}, {"question_id": 37183, "answer": "golden dragon"}, {"question_id": 37186, "answer": "small"}, {"question_id": 37189, "answer": "test of devices"}, {"question_id": 37192, "answer": "yes"}, {"question_id": 37195, "answer": "yes"}, {"question_id": 37198, "answer": "2"}, {"question_id": 37201, "answer": "eisenbahnbr\u00fccke"}, {"question_id": 37204, "answer": "heineken"}, {"question_id": 37207, "answer": "rings"}, {"question_id": 37210, "answer": "innocent"}, {"question_id": 37213, "answer": "national book award"}, {"question_id": 37216, "answer": "spring water"}, {"question_id": 37219, "answer": "dublin"}, {"question_id": 37222, "answer": "what is title of poster"}, {"question_id": 37225, "answer": "mouse"}, {"question_id": 37228, "answer": "graciously"}, {"question_id": 37231, "answer": "friday"}, {"question_id": 37234, "answer": "killing lincoln"}, {"question_id": 37237, "answer": "identita"}, {"question_id": 37240, "answer": "yes"}, {"question_id": 37243, "answer": "angel"}, {"question_id": 37246, "answer": "beef"}, {"question_id": 37249, "answer": "bourbon"}, {"question_id": 37252, "answer": "37"}, {"question_id": 37255, "answer": "rock"}, {"question_id": 37258, "answer": "rutgers"}, {"question_id": 37261, "answer": "red"}, {"question_id": 37264, "answer": "coupons"}, {"question_id": 37267, "answer": "totoro"}, {"question_id": 37270, "answer": "cat"}, {"question_id": 37273, "answer": "rock"}, {"question_id": 37276, "answer": "samsung"}, {"question_id": 37279, "answer": "china"}, {"question_id": 37282, "answer": "177"}, {"question_id": 37285, "answer": "tennis"}, {"question_id": 37288, "answer": "traveler"}, {"question_id": 37291, "answer": "marmite"}, {"question_id": 37294, "answer": "kalamazoo"}, {"question_id": 37297, "answer": "2"}, {"question_id": 37300, "answer": "shift"}, {"question_id": 37303, "answer": "we're club 4u"}, {"question_id": 37306, "answer": "advantage"}, {"question_id": 37309, "answer": "love"}, {"question_id": 37312, "answer": "vietnam"}, {"question_id": 37315, "answer": "open data"}, {"question_id": 37318, "answer": "yes"}, {"question_id": 37321, "answer": "samsung"}, {"question_id": 37324, "answer": "12 12 12"}, {"question_id": 37327, "answer": "10"}, {"question_id": 37330, "answer": "apple"}, {"question_id": 37333, "answer": "simplex"}, {"question_id": 37336, "answer": "coffee"}, {"question_id": 37339, "answer": "2010"}, {"question_id": 37342, "answer": "12"}, {"question_id": 37345, "answer": "mobile booking service"}, {"question_id": 37348, "answer": "as above so below"}, {"question_id": 37351, "answer": "ii"}, {"question_id": 37354, "answer": "guard against imitation"}, {"question_id": 37357, "answer": "work rework"}, {"question_id": 37360, "answer": "malaysian"}, {"question_id": 37363, "answer": "b"}, {"question_id": 37366, "answer": "nike"}, {"question_id": 37369, "answer": "cathedral"}, {"question_id": 37372, "answer": "kx 100 messages"}, {"question_id": 37375, "answer": "t"}, {"question_id": 37378, "answer": "o"}, {"question_id": 37381, "answer": "10:00"}, {"question_id": 37384, "answer": "driving"}, {"question_id": 37387, "answer": "10"}, {"question_id": 37390, "answer": "bembo"}, {"question_id": 37393, "answer": "8"}, {"question_id": 37396, "answer": "3"}, {"question_id": 37399, "answer": "33"}, {"question_id": 37402, "answer": "1820"}, {"question_id": 37405, "answer": "souvranis notabilies dindouchine"}, {"question_id": 37408, "answer": "harry potter"}, {"question_id": 37411, "answer": "bottles"}, {"question_id": 37414, "answer": "sharepoint"}, {"question_id": 37417, "answer": "pete"}, {"question_id": 37420, "answer": "dynasty warriors: gundam reborn"}, {"question_id": 37423, "answer": "turbo 1000"}, {"question_id": 37426, "answer": "pocket"}, {"question_id": 37429, "answer": "peanuts"}, {"question_id": 37432, "answer": "yes"}, {"question_id": 37435, "answer": "bruno"}, {"question_id": 37438, "answer": "limited edition"}, {"question_id": 37441, "answer": "genitry"}, {"question_id": 37444, "answer": "9"}, {"question_id": 37447, "answer": "coca cola"}, {"question_id": 37450, "answer": "yellow book"}, {"question_id": 37453, "answer": "cary l blumberg"}, {"question_id": 37456, "answer": "facebook"}, {"question_id": 37459, "answer": "10 2010"}, {"question_id": 37462, "answer": "yes"}, {"question_id": 37465, "answer": "phillips"}, {"question_id": 37468, "answer": "10 10 2010"}, {"question_id": 37471, "answer": "motorola"}, {"question_id": 37474, "answer": "15"}, {"question_id": 37477, "answer": "yes"}, {"question_id": 37480, "answer": "choice"}, {"question_id": 37483, "answer": "50"}, {"question_id": 37486, "answer": "yankees"}, {"question_id": 37489, "answer": "smile"}, {"question_id": 37492, "answer": "swin"}, {"question_id": 37495, "answer": "perez"}, {"question_id": 37498, "answer": "10"}, {"question_id": 37501, "answer": "bible"}, {"question_id": 37504, "answer": "control"}, {"question_id": 37507, "answer": "0000000000"}, {"question_id": 37510, "answer": "no"}, {"question_id": 37513, "answer": "stop"}, {"question_id": 37516, "answer": "yes"}, {"question_id": 37519, "answer": "nutrition"}, {"question_id": 37522, "answer": "cold"}, {"question_id": 37525, "answer": "andersen"}, {"question_id": 37528, "answer": "1990"}, {"question_id": 37531, "answer": "displaying"}, {"question_id": 37534, "answer": "hajime isayama"}, {"question_id": 37537, "answer": "128"}, {"question_id": 37540, "answer": "300"}, {"question_id": 37543, "answer": "3"}, {"question_id": 37546, "answer": "sign"}, {"question_id": 37549, "answer": "1234"}, {"question_id": 37552, "answer": "ww 1022"}, {"question_id": 37555, "answer": "1976"}, {"question_id": 37558, "answer": "pea"}, {"question_id": 37561, "answer": "yes"}, {"question_id": 37564, "answer": "10:00"}, {"question_id": 37567, "answer": "power"}, {"question_id": 37570, "answer": "day"}, {"question_id": 37573, "answer": "10:30"}, {"question_id": 37576, "answer": "mets"}, {"question_id": 37579, "answer": "star wars"}, {"question_id": 37582, "answer": "tablets"}, {"question_id": 37585, "answer": "dot matrix"}, {"question_id": 37588, "answer": "google"}, {"question_id": 37591, "answer": "fly emirates"}, {"question_id": 37594, "answer": "6"}, {"question_id": 37597, "answer": "2000"}, {"question_id": 37600, "answer": "play money"}, {"question_id": 37603, "answer": "flickr"}, {"question_id": 37606, "answer": "adidas"}, {"question_id": 37609, "answer": "benjamin benjamin benjamine benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin benjamin"}, {"question_id": 37612, "answer": "heartbreak"}, {"question_id": 37615, "answer": "france"}, {"question_id": 37618, "answer": "title slide"}, {"question_id": 37621, "answer": "red"}, {"question_id": 37624, "answer": "rio de janeiro"}, {"question_id": 37627, "answer": "no"}, {"question_id": 37630, "answer": "jeremiah weed"}, {"question_id": 37633, "answer": "airasia"}, {"question_id": 37636, "answer": "coca cola"}, {"question_id": 37639, "answer": "bruins"}, {"question_id": 37642, "answer": "yes"}, {"question_id": 37645, "answer": ""}, {"question_id": 37648, "answer": "start"}, {"question_id": 37651, "answer": "potato chips"}, {"question_id": 37654, "answer": ""}, {"question_id": 37657, "answer": "guinness"}, {"question_id": 37660, "answer": "florida"}, {"question_id": 37663, "answer": "candy"}, {"question_id": 37666, "answer": "r heinlein"}, {"question_id": 37669, "answer": "prophecies of nostradamus"}, {"question_id": 37672, "answer": "photographer"}, {"question_id": 37675, "answer": "escape"}, {"question_id": 37678, "answer": "alan"}, {"question_id": 37681, "answer": "denmark"}, {"question_id": 37684, "answer": "terraform insurance"}, {"question_id": 37687, "answer": "621338"}, {"question_id": 37690, "answer": "it is not from compassion"}, {"question_id": 37693, "answer": "50"}, {"question_id": 37696, "answer": "ua"}, {"question_id": 37699, "answer": "bitter"}, {"question_id": 37702, "answer": "cheese"}, {"question_id": 37705, "answer": "herrmann"}, {"question_id": 37708, "answer": "10:00"}, {"question_id": 37711, "answer": "11"}, {"question_id": 37714, "answer": "clipper"}, {"question_id": 37717, "answer": "book"}, {"question_id": 37720, "answer": "whiskey"}, {"question_id": 37723, "answer": "jennifer brennan"}, {"question_id": 37726, "answer": "yes"}, {"question_id": 37729, "answer": "coca cola"}, {"question_id": 37732, "answer": "road work ahead"}, {"question_id": 37735, "answer": "yes"}, {"question_id": 37738, "answer": "2"}, {"question_id": 37741, "answer": "sunday"}, {"question_id": 37744, "answer": "2"}, {"question_id": 37747, "answer": "savannah apartments"}, {"question_id": 37750, "answer": "car"}, {"question_id": 37753, "answer": "n"}, {"question_id": 37756, "answer": "no"}, {"question_id": 37759, "answer": "50"}, {"question_id": 37762, "answer": "freds beauty"}, {"question_id": 37765, "answer": "langton"}, {"question_id": 37768, "answer": "yes"}, {"question_id": 37771, "answer": "game box"}, {"question_id": 37774, "answer": "candy"}, {"question_id": 37777, "answer": "1234"}, {"question_id": 37780, "answer": "no"}, {"question_id": 37783, "answer": "7"}, {"question_id": 37786, "answer": "verreaux"}, {"question_id": 37789, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37792, "answer": "western jet"}, {"question_id": 37795, "answer": "yellow"}, {"question_id": 37798, "answer": "yvv"}, {"question_id": 37801, "answer": "france"}, {"question_id": 37804, "answer": "t"}, {"question_id": 37807, "answer": "black"}, {"question_id": 37810, "answer": "water"}, {"question_id": 37813, "answer": "severall defnec with i t i t e l"}, {"question_id": 37816, "answer": "12"}, {"question_id": 37819, "answer": "california"}, {"question_id": 37822, "answer": "racercom"}, {"question_id": 37825, "answer": "8"}, {"question_id": 37828, "answer": "arsenalcom"}, {"question_id": 37831, "answer": "40"}, {"question_id": 37834, "answer": "pulkakex"}, {"question_id": 37837, "answer": "samsung"}, {"question_id": 37840, "answer": "women"}, {"question_id": 37843, "answer": "new writings in sf.9"}, {"question_id": 37846, "answer": "p"}, {"question_id": 37849, "answer": "jazz"}, {"question_id": 37852, "answer": "10 20 2010"}, {"question_id": 37855, "answer": "sonic"}, {"question_id": 37858, "answer": "ferbet"}, {"question_id": 37861, "answer": "april"}, {"question_id": 37864, "answer": "6000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37867, "answer": "10:10"}, {"question_id": 37870, "answer": "rolex"}, {"question_id": 37873, "answer": "smartened children will be seen and free kitten"}, {"question_id": 37876, "answer": "romeo"}, {"question_id": 37879, "answer": "dell"}, {"question_id": 37882, "answer": "10"}, {"question_id": 37885, "answer": "x"}, {"question_id": 37888, "answer": "windows"}, {"question_id": 37891, "answer": "british"}, {"question_id": 37894, "answer": "sedan"}, {"question_id": 37897, "answer": "elba"}, {"question_id": 37900, "answer": ""}, {"question_id": 37903, "answer": "canon"}, {"question_id": 37906, "answer": "globe and mail"}, {"question_id": 37909, "answer": "advertisement"}, {"question_id": 37912, "answer": "yes"}, {"question_id": 37915, "answer": "adidas"}, {"question_id": 37918, "answer": "merlot"}, {"question_id": 37921, "answer": "1999"}, {"question_id": 37924, "answer": "voting pamphlet"}, {"question_id": 37927, "answer": "7:18"}, {"question_id": 37930, "answer": "park"}, {"question_id": 37933, "answer": "love"}, {"question_id": 37936, "answer": "new york"}, {"question_id": 37939, "answer": "white"}, {"question_id": 37942, "answer": "united states"}, {"question_id": 37945, "answer": "france"}, {"question_id": 37948, "answer": "james wong howe"}, {"question_id": 37951, "answer": "beaker"}, {"question_id": 37954, "answer": "auditorium"}, {"question_id": 37957, "answer": "new york"}, {"question_id": 37960, "answer": "lol"}, {"question_id": 37963, "answer": "yes"}, {"question_id": 37966, "answer": "root beer"}, {"question_id": 37969, "answer": "palm"}, {"question_id": 37972, "answer": "catholiceservicescom"}, {"question_id": 37975, "answer": "t"}, {"question_id": 37978, "answer": "yes"}, {"question_id": 37981, "answer": "rolex"}, {"question_id": 37984, "answer": "100"}, {"question_id": 37987, "answer": "canada"}, {"question_id": 37990, "answer": "nuc77"}, {"question_id": 37993, "answer": "17"}, {"question_id": 37996, "answer": "sight & sound"}, {"question_id": 37999, "answer": "29"}, {"question_id": 38002, "answer": "google"}, {"question_id": 38005, "answer": "edge magazine"}, {"question_id": 38008, "answer": "45"}, {"question_id": 38011, "answer": "unknown"}, {"question_id": 38014, "answer": "laptop"}, {"question_id": 38017, "answer": "dell"}, {"question_id": 38020, "answer": "kompass"}, {"question_id": 38023, "answer": "stop"}, {"question_id": 38026, "answer": "denver"}, {"question_id": 38029, "answer": "118"}, {"question_id": 38032, "answer": "horror"}, {"question_id": 38035, "answer": "black russian"}, {"question_id": 38038, "answer": "1900"}, {"question_id": 38041, "answer": "black"}, {"question_id": 38044, "answer": "tesco"}, {"question_id": 38047, "answer": "yes"}, {"question_id": 38050, "answer": "scott"}, {"question_id": 38053, "answer": "johnson johnson"}, {"question_id": 38056, "answer": "toshiba"}, {"question_id": 38059, "answer": "yellow"}, {"question_id": 38062, "answer": "10"}, {"question_id": 38065, "answer": "coca cola"}, {"question_id": 38068, "answer": "meteor menace meteor menace"}, {"question_id": 38071, "answer": "eli marcotte 's responsive web design"}, {"question_id": 38074, "answer": "lg"}, {"question_id": 38077, "answer": "stop"}, {"question_id": 38080, "answer": "800 555 1234"}, {"question_id": 38083, "answer": "windows"}, {"question_id": 38086, "answer": "city"}, {"question_id": 38089, "answer": "no"}, {"question_id": 38092, "answer": "dr lalfer"}, {"question_id": 38095, "answer": "feybom"}, {"question_id": 38098, "answer": "49"}, {"question_id": 38101, "answer": "move"}, {"question_id": 38104, "answer": "woman"}, {"question_id": 38107, "answer": "v"}, {"question_id": 38110, "answer": "10:00"}, {"question_id": 38113, "answer": "10"}, {"question_id": 38116, "answer": "germany"}, {"question_id": 38119, "answer": "2009"}, {"question_id": 38122, "answer": "command"}, {"question_id": 38125, "answer": "yes"}, {"question_id": 38128, "answer": "yes"}, {"question_id": 38131, "answer": "beer"}, {"question_id": 38134, "answer": "yes"}, {"question_id": 38137, "answer": "fingerprint"}, {"question_id": 38140, "answer": "10"}, {"question_id": 38143, "answer": "dream angels"}, {"question_id": 38146, "answer": "100"}, {"question_id": 38149, "answer": "12"}, {"question_id": 38152, "answer": "kataar"}, {"question_id": 38155, "answer": "5000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 38158, "answer": "kotch 1971"}, {"question_id": 38161, "answer": "fine food"}, {"question_id": 38164, "answer": "hopfenbrau"}, {"question_id": 38167, "answer": "laser cutting"}, {"question_id": 38170, "answer": "b"}, {"question_id": 38173, "answer": "0"}, {"question_id": 38176, "answer": "1066"}, {"question_id": 38179, "answer": "guerrero"}, {"question_id": 38182, "answer": "sony"}, {"question_id": 38185, "answer": "london"}, {"question_id": 38188, "answer": "iran"}, {"question_id": 38191, "answer": "11"}, {"question_id": 38194, "answer": "no"}, {"question_id": 38197, "answer": "goodbye"}, {"question_id": 38200, "answer": "puma"}, {"question_id": 38203, "answer": "10"}, {"question_id": 38206, "answer": "1899"}, {"question_id": 38209, "answer": "yankees"}, {"question_id": 38212, "answer": "1"}, {"question_id": 38215, "answer": "samsung"}, {"question_id": 38218, "answer": "24"}, {"question_id": 38221, "answer": "red button"}, {"question_id": 38224, "answer": ""}, {"question_id": 38227, "answer": "adidas"}, {"question_id": 38230, "answer": "el capitan"}, {"question_id": 38233, "answer": "no"}, {"question_id": 38236, "answer": "yes"}, {"question_id": 38239, "answer": "fitbit"}, {"question_id": 38242, "answer": "south"}, {"question_id": 38245, "answer": "silly"}, {"question_id": 38248, "answer": "5"}, {"question_id": 38251, "answer": "1900"}, {"question_id": 38254, "answer": "sony"}, {"question_id": 38257, "answer": "no"}, {"question_id": 38260, "answer": "o henry"}, {"question_id": 38263, "answer": "flawless"}, {"question_id": 38266, "answer": "american apparel"}, {"question_id": 38269, "answer": "park and ride"}, {"question_id": 38272, "answer": "france"}, {"question_id": 38275, "answer": "john updike"}, {"question_id": 38278, "answer": "young person 's guide to behavior"}, {"question_id": 38281, "answer": "cremant de pouilly fuiss\u00e9"}, {"question_id": 38284, "answer": "1999"}, {"question_id": 38287, "answer": "$10"}, {"question_id": 38290, "answer": "10:10"}, {"question_id": 38293, "answer": "xbox embedded"}, {"question_id": 38296, "answer": "jim"}, {"question_id": 38299, "answer": "13"}, {"question_id": 38302, "answer": "10:11"}, {"question_id": 38305, "answer": "map"}, {"question_id": 38308, "answer": "google"}, {"question_id": 38311, "answer": "chardonnay"}, {"question_id": 38314, "answer": "yes"}, {"question_id": 38317, "answer": "no"}, {"question_id": 38320, "answer": "detour"}, {"question_id": 38323, "answer": "hello"}, {"question_id": 38326, "answer": "burgundy"}, {"question_id": 38329, "answer": "l"}, {"question_id": 38332, "answer": "apple"}, {"question_id": 38335, "answer": "arriva"}, {"question_id": 38338, "answer": "guest"}, {"question_id": 38341, "answer": "10:00"}, {"question_id": 38344, "answer": "12"}, {"question_id": 38347, "answer": "tatamagazine"}, {"question_id": 38350, "answer": "old"}, {"question_id": 38353, "answer": "rum"}, {"question_id": 38356, "answer": "800 222 5555"}, {"question_id": 38359, "answer": "1600"}, {"question_id": 38362, "answer": "neal stephenson"}, {"question_id": 38365, "answer": "what 's your favorite language"}, {"question_id": 38368, "answer": "laptop battery"}, {"question_id": 38371, "answer": "germany"}, {"question_id": 38374, "answer": "americans"}, {"question_id": 38377, "answer": "apple"}, {"question_id": 38380, "answer": "5"}, {"question_id": 38383, "answer": "22"}, {"question_id": 38386, "answer": "v\u00f6rkers"}, {"question_id": 38389, "answer": "30"}, {"question_id": 38392, "answer": "2009"}, {"question_id": 38395, "answer": "duck"}, {"question_id": 38398, "answer": "10 20 2015"}, {"question_id": 38401, "answer": "dubbel"}, {"question_id": 38404, "answer": "samsung"}, {"question_id": 38407, "answer": "war"}, {"question_id": 38410, "answer": "canada"}, {"question_id": 38413, "answer": "august 4th"}, {"question_id": 38416, "answer": "dallas"}, {"question_id": 38419, "answer": "13"}, {"question_id": 38422, "answer": "touch chicks"}, {"question_id": 38425, "answer": "iron maiden killers"}, {"question_id": 38428, "answer": "rock"}, {"question_id": 38431, "answer": "budweiser"}, {"question_id": 38434, "answer": "website"}, {"question_id": 38437, "answer": "census 2010"}, {"question_id": 38440, "answer": "fiver"}, {"question_id": 38443, "answer": "northwestern"}, {"question_id": 38446, "answer": "kamagero"}, {"question_id": 38449, "answer": "shiner"}, {"question_id": 38452, "answer": "ephesus"}, {"question_id": 38455, "answer": "sign"}, {"question_id": 38458, "answer": "rolex"}, {"question_id": 38461, "answer": "10:30"}, {"question_id": 38464, "answer": "korea"}, {"question_id": 38467, "answer": "yes"}, {"question_id": 38470, "answer": "building"}, {"question_id": 38473, "answer": "10"}, {"question_id": 38476, "answer": "11 inches"}, {"question_id": 38479, "answer": "trojan talkcom"}, {"question_id": 38482, "answer": "jb"}, {"question_id": 38485, "answer": "baseball"}, {"question_id": 38488, "answer": "19"}, {"question_id": 38491, "answer": "motorcycle"}, {"question_id": 38494, "answer": "hamburg"}, {"question_id": 38497, "answer": "bullet"}, {"question_id": 38500, "answer": "vino 66"}, {"question_id": 38503, "answer": "force"}, {"question_id": 38506, "answer": "1999"}, {"question_id": 38509, "answer": "i promise you i love you german"}, {"question_id": 38512, "answer": "no"}, {"question_id": 38515, "answer": "100"}, {"question_id": 38518, "answer": "book"}, {"question_id": 38521, "answer": "tamron"}, {"question_id": 38524, "answer": "pentax"}, {"question_id": 38527, "answer": "nike"}, {"question_id": 38530, "answer": "school"}, {"question_id": 38533, "answer": "kaboom"}, {"question_id": 38536, "answer": "onnamacom"}, {"question_id": 38539, "answer": "11:05"}, {"question_id": 38542, "answer": "yes"}, {"question_id": 38545, "answer": "10:00"}, {"question_id": 38548, "answer": "coffee"}, {"question_id": 38551, "answer": "adidas"}, {"question_id": 38554, "answer": "yes"}, {"question_id": 38557, "answer": "1999"}, {"question_id": 38560, "answer": "tag"}, {"question_id": 38563, "answer": "logos"}, {"question_id": 38566, "answer": "white"}, {"question_id": 38569, "answer": "lg"}, {"question_id": 38572, "answer": "qatar 's time for all"}, {"question_id": 38575, "answer": "sony"}, {"question_id": 38578, "answer": "red sox"}, {"question_id": 38581, "answer": "10:10"}, {"question_id": 38584, "answer": "ruler"}, {"question_id": 38587, "answer": "pepsi"}, {"question_id": 38590, "answer": "parking lot"}, {"question_id": 38593, "answer": "chinese"}, {"question_id": 38596, "answer": "piano"}, {"question_id": 38599, "answer": ""}, {"question_id": 38602, "answer": "1969"}, {"question_id": 38605, "answer": "mountain dew"}, {"question_id": 38608, "answer": "10"}, {"question_id": 38611, "answer": "1900"}, {"question_id": 38614, "answer": "earth"}, {"question_id": 38617, "answer": "brown"}, {"question_id": 38620, "answer": "lm"}, {"question_id": 38623, "answer": "federico de bock"}, {"question_id": 38626, "answer": "corporate sponsored events"}, {"question_id": 38629, "answer": "yes"}, {"question_id": 38632, "answer": "bouton r\u00e9mi"}, {"question_id": 38635, "answer": "rye"}, {"question_id": 38638, "answer": "10"}, {"question_id": 38641, "answer": "underwater"}, {"question_id": 38644, "answer": "john pizzarelli"}, {"question_id": 38647, "answer": "adidas"}, {"question_id": 38650, "answer": "yes"}, {"question_id": 38653, "answer": "to be happy make other people happy"}, {"question_id": 38656, "answer": "unlike"}, {"question_id": 38659, "answer": "artists"}, {"question_id": 38662, "answer": "vietnam"}, {"question_id": 38665, "answer": "cucumbers"}, {"question_id": 38668, "answer": "bulletin"}, {"question_id": 38671, "answer": "no name"}, {"question_id": 38674, "answer": "horror"}, {"question_id": 38677, "answer": "polaris"}, {"question_id": 38680, "answer": "cal"}, {"question_id": 38683, "answer": "not applicable"}, {"question_id": 38686, "answer": "yr2088"}, {"question_id": 38689, "answer": "2012"}, {"question_id": 38692, "answer": "orange"}, {"question_id": 38695, "answer": "hacking"}, {"question_id": 38698, "answer": "10"}, {"question_id": 38701, "answer": "adventures of sherlock holmes"}, {"question_id": 38704, "answer": "kent"}, {"question_id": 38707, "answer": "coarse"}, {"question_id": 38710, "answer": "1838"}, {"question_id": 38713, "answer": "clinical notes"}, {"question_id": 38716, "answer": "broadway"}, {"question_id": 38719, "answer": "stadium"}, {"question_id": 38722, "answer": "apple"}, {"question_id": 38725, "answer": "northern lion"}, {"question_id": 38728, "answer": "fire prevention"}, {"question_id": 38731, "answer": "dictionary of bullshit"}, {"question_id": 38734, "answer": "corazon"}, {"question_id": 38737, "answer": "1999"}, {"question_id": 38740, "answer": "yes"}, {"question_id": 38743, "answer": "german"}, {"question_id": 38746, "answer": "radio"}, {"question_id": 38749, "answer": "puma"}, {"question_id": 38752, "answer": "rolex"}, {"question_id": 38755, "answer": "9999"}, {"question_id": 38758, "answer": "bike"}, {"question_id": 38761, "answer": "100"}, {"question_id": 38764, "answer": "red"}, {"question_id": 38767, "answer": "fire"}, {"question_id": 38770, "answer": "aluris"}, {"question_id": 38773, "answer": "harry potter"}, {"question_id": 38776, "answer": "battle"}, {"question_id": 38779, "answer": "orange"}, {"question_id": 38782, "answer": "j"}, {"question_id": 38785, "answer": "tom brinkman"}, {"question_id": 38788, "answer": "italy"}, {"question_id": 38791, "answer": "10:00"}, {"question_id": 38794, "answer": "saturday"}, {"question_id": 38797, "answer": "50"}, {"question_id": 38800, "answer": "10:00"}, {"question_id": 38803, "answer": "rolex"}, {"question_id": 38806, "answer": "corbusier"}, {"question_id": 38809, "answer": "corton"}, {"question_id": 38812, "answer": "coffee"}, {"question_id": 38815, "answer": "reviving mummy"}, {"question_id": 38818, "answer": "10:00"}, {"question_id": 38821, "answer": "ecton kolin"}, {"question_id": 38824, "answer": "tire kars"}, {"question_id": 38827, "answer": "nintendo"}, {"question_id": 38830, "answer": "10:30"}, {"question_id": 38833, "answer": "18"}, {"question_id": 38836, "answer": "lion"}, {"question_id": 38839, "answer": "pure white rock potion"}, {"question_id": 38842, "answer": "100"}, {"question_id": 38845, "answer": "baby"}, {"question_id": 38848, "answer": "ambulance"}, {"question_id": 38851, "answer": "cell phone"}, {"question_id": 38854, "answer": "children"}, {"question_id": 38857, "answer": "coffee maker"}, {"question_id": 38860, "answer": "florida"}, {"question_id": 38863, "answer": "80"}, {"question_id": 38866, "answer": "book fair"}, {"question_id": 38869, "answer": "english"}, {"question_id": 38872, "answer": "cemetery"}, {"question_id": 38875, "answer": "pirae"}, {"question_id": 38878, "answer": "trash can"}, {"question_id": 38881, "answer": "adidas"}, {"question_id": 38884, "answer": "qu\u00e9becer"}, {"question_id": 38887, "answer": "britain"}, {"question_id": 38890, "answer": "16 oz"}, {"question_id": 38893, "answer": "no"}, {"question_id": 38896, "answer": "85"}, {"question_id": 38899, "answer": "wedding"}, {"question_id": 38902, "answer": "tuesday"}, {"question_id": 38905, "answer": "heavy metals"}, {"question_id": 38908, "answer": "virginia"}, {"question_id": 38911, "answer": "mall"}, {"question_id": 38914, "answer": "no"}, {"question_id": 38917, "answer": "met"}, {"question_id": 38920, "answer": "amani"}, {"question_id": 38923, "answer": "10"}, {"question_id": 38926, "answer": "1994"}, {"question_id": 38929, "answer": "1900s"}, {"question_id": 38932, "answer": "letter"}, {"question_id": 38935, "answer": "prestige"}, {"question_id": 38938, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 38941, "answer": "reference"}, {"question_id": 38944, "answer": "coffee"}, {"question_id": 38947, "answer": "star wars"}, {"question_id": 38950, "answer": "orange"}, {"question_id": 38953, "answer": "12"}, {"question_id": 38956, "answer": "10"}, {"question_id": 38959, "answer": "11 was inside job"}, {"question_id": 38962, "answer": "10:10"}, {"question_id": 38965, "answer": "dinosaur"}, {"question_id": 38968, "answer": "richard morgan"}, {"question_id": 38971, "answer": "model 100"}, {"question_id": 38974, "answer": "cold"}, {"question_id": 38977, "answer": "sports"}, {"question_id": 38980, "answer": "bottle"}, {"question_id": 38983, "answer": "$10"}, {"question_id": 38986, "answer": "htc"}, {"question_id": 38989, "answer": "van"}, {"question_id": 38992, "answer": "21"}, {"question_id": 38995, "answer": "2005"}, {"question_id": 38998, "answer": "products"}, {"question_id": 39001, "answer": "computers"}, {"question_id": 39004, "answer": "gilchrist"}, {"question_id": 39007, "answer": "germany"}, {"question_id": 39010, "answer": "french"}, {"question_id": 39013, "answer": "sunshine"}, {"question_id": 39016, "answer": "5"}, {"question_id": 39019, "answer": "right"}, {"question_id": 39022, "answer": "12"}, {"question_id": 39025, "answer": "42"}, {"question_id": 39028, "answer": "keyboard"}, {"question_id": 39031, "answer": "stop"}, {"question_id": 39034, "answer": "star"}, {"question_id": 39037, "answer": "new york"}, {"question_id": 39040, "answer": "10"}, {"question_id": 39043, "answer": "usb"}, {"question_id": 39046, "answer": "sovereign"}, {"question_id": 39049, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 39052, "answer": "1000"}, {"question_id": 39055, "answer": "4"}, {"question_id": 39058, "answer": "cowboy"}, {"question_id": 39061, "answer": "yes"}, {"question_id": 39064, "answer": "ballad"}, {"question_id": 39067, "answer": "pink lady"}, {"question_id": 39070, "answer": "samsung"}, {"question_id": 39073, "answer": "tire shop"}, {"question_id": 39076, "answer": "lenovo"}, {"question_id": 39079, "answer": "rescue"}, {"question_id": 39082, "answer": "wireless"}, {"question_id": 39085, "answer": "4"}, {"question_id": 39088, "answer": "no name"}, {"question_id": 39091, "answer": "apple"}, {"question_id": 39094, "answer": "msi"}, {"question_id": 39097, "answer": "no"}, {"question_id": 39100, "answer": "corona"}, {"question_id": 39103, "answer": "primal of nature by r gouldt"}, {"question_id": 39106, "answer": "title of book on right is lost world"}, {"question_id": 39109, "answer": "$1.99"}, {"question_id": 39112, "answer": "powers"}, {"question_id": 39115, "answer": "10"}, {"question_id": 39118, "answer": "l"}, {"question_id": 39121, "answer": "length"}, {"question_id": 39124, "answer": "wisconsin"}, {"question_id": 39127, "answer": "lego"}, {"question_id": 39130, "answer": "smith"}, {"question_id": 39133, "answer": "nike"}, {"question_id": 39136, "answer": "stop sign"}, {"question_id": 39139, "answer": "55"}, {"question_id": 39142, "answer": "propaganda"}, {"question_id": 39145, "answer": "blogspotcom"}, {"question_id": 39148, "answer": "texas"}, {"question_id": 39151, "answer": "10"}, {"question_id": 39154, "answer": "trash"}, {"question_id": 39157, "answer": "resurrection"}, {"question_id": 39160, "answer": "chocolate"}, {"question_id": 39163, "answer": "1999"}, {"question_id": 39166, "answer": "japan"}, {"question_id": 39169, "answer": "128"}, {"question_id": 39172, "answer": "white"}, {"question_id": 39175, "answer": "11:35"}, {"question_id": 39178, "answer": "alto ha"}, {"question_id": 39181, "answer": "september 2014"}, {"question_id": 39184, "answer": "yes"}, {"question_id": 39187, "answer": "$3.00"}, {"question_id": 39190, "answer": "yes"}, {"question_id": 39193, "answer": "samsung"}, {"question_id": 39196, "answer": "$100"}, {"question_id": 39199, "answer": "coca cola"}, {"question_id": 39202, "answer": "whippers times"}, {"question_id": 39205, "answer": "disney ride"}, {"question_id": 39208, "answer": "born digital"}, {"question_id": 39211, "answer": "y"}, {"question_id": 39214, "answer": "33"}, {"question_id": 39217, "answer": "montevideo"}, {"question_id": 39220, "answer": "1935"}, {"question_id": 39223, "answer": "pick it up in pack"}, {"question_id": 39226, "answer": "yes"}, {"question_id": 39229, "answer": "rocher figeac"}, {"question_id": 39232, "answer": "toshiba"}, {"question_id": 39235, "answer": "yes"}, {"question_id": 39238, "answer": "6"}, {"question_id": 39241, "answer": "ray"}, {"question_id": 39244, "answer": "katarri"}, {"question_id": 39247, "answer": "10:10"}, {"question_id": 39250, "answer": "7"}, {"question_id": 39253, "answer": "taiwan"}, {"question_id": 39256, "answer": "chicken"}, {"question_id": 39259, "answer": "10:30"}, {"question_id": 39262, "answer": "b c"}, {"question_id": 39265, "answer": "guy"}, {"question_id": 39268, "answer": "willy"}, {"question_id": 39271, "answer": "cigars"}, {"question_id": 39274, "answer": "sigtercom"}, {"question_id": 39277, "answer": "t"}, {"question_id": 39280, "answer": "lincoln"}, {"question_id": 39283, "answer": "kensal green"}, {"question_id": 39286, "answer": "introduction"}, {"question_id": 39289, "answer": "11056"}, {"question_id": 39292, "answer": "wooden box"}, {"question_id": 39295, "answer": "independent"}, {"question_id": 39298, "answer": "1"}, {"question_id": 39301, "answer": "sure"}, {"question_id": 39304, "answer": "1910"}, {"question_id": 39307, "answer": "idorg"}, {"question_id": 39310, "answer": "robert galbraith"}, {"question_id": 39313, "answer": "pizza"}, {"question_id": 39316, "answer": "pain"}, {"question_id": 39319, "answer": "10"}, {"question_id": 39322, "answer": "fast food"}, {"question_id": 39325, "answer": "ford"}, {"question_id": 39328, "answer": "andres perz"}, {"question_id": 39331, "answer": "100"}, {"question_id": 39334, "answer": "yes"}, {"question_id": 39337, "answer": "11"}, {"question_id": 39340, "answer": "beauty queen"}, {"question_id": 39343, "answer": "renters assurance"}, {"question_id": 39346, "answer": "100"}, {"question_id": 39349, "answer": "yes"}, {"question_id": 39352, "answer": "3"}, {"question_id": 39355, "answer": "granada market"}, {"question_id": 39358, "answer": "nike"}, {"question_id": 39361, "answer": "yes"}, {"question_id": 39364, "answer": "samsung"}, {"question_id": 39367, "answer": "s"}, {"question_id": 39370, "answer": "6"}, {"question_id": 39373, "answer": "8"}, {"question_id": 39376, "answer": "22"}, {"question_id": 39379, "answer": "color tv"}, {"question_id": 39382, "answer": "emirates"}, {"question_id": 39385, "answer": "00:00:00"}, {"question_id": 39388, "answer": "store"}, {"question_id": 39391, "answer": "yellow"}, {"question_id": 39394, "answer": "james"}, {"question_id": 39397, "answer": "space bar"}, {"question_id": 39400, "answer": "yes"}, {"question_id": 39403, "answer": "white"}, {"question_id": 39406, "answer": "cider"}, {"question_id": 39409, "answer": "diet"}, {"question_id": 39412, "answer": "ice river"}, {"question_id": 39415, "answer": "fallopia"}, {"question_id": 39418, "answer": "red sox"}, {"question_id": 39421, "answer": "yes"}, {"question_id": 39424, "answer": "subway"}, {"question_id": 39427, "answer": "taxi"}, {"question_id": 39430, "answer": "m"}, {"question_id": 39433, "answer": "1"}, {"question_id": 39436, "answer": "e van vogt"}, {"question_id": 39439, "answer": "yes"}, {"question_id": 39442, "answer": "trucker"}, {"question_id": 39445, "answer": "white"}, {"question_id": 39448, "answer": "no teachers"}, {"question_id": 39451, "answer": "toronto"}, {"question_id": 39454, "answer": "0123456789"}, {"question_id": 39457, "answer": "diabetic"}, {"question_id": 39460, "answer": "32"}, {"question_id": 39463, "answer": "longhorns"}, {"question_id": 39466, "answer": "16 oz"}, {"question_id": 39469, "answer": "yes"}, {"question_id": 39472, "answer": "11008"}, {"question_id": 39475, "answer": "guinness"}, {"question_id": 39478, "answer": "dark journey"}, {"question_id": 39481, "answer": "red sox"}, {"question_id": 39484, "answer": "rachel jones"}, {"question_id": 39487, "answer": "chateau"}, {"question_id": 39490, "answer": "yes"}, {"question_id": 39493, "answer": "text"}, {"question_id": 39496, "answer": "elizabeth aynsley"}, {"question_id": 39499, "answer": "mixed"}, {"question_id": 39502, "answer": "marlboro"}, {"question_id": 39505, "answer": "martini"}, {"question_id": 39508, "answer": "blinds mice"}, {"question_id": 39511, "answer": "laptop"}, {"question_id": 39514, "answer": "1876"}, {"question_id": 39517, "answer": "yes"}, {"question_id": 39520, "answer": "navy"}, {"question_id": 39523, "answer": "2008"}, {"question_id": 39526, "answer": "3"}, {"question_id": 39529, "answer": "uniunea europa politicissimo periodi piete spanish"}, {"question_id": 39532, "answer": "katariki"}, {"question_id": 39535, "answer": "singapore"}, {"question_id": 39538, "answer": "0123456789"}, {"question_id": 39541, "answer": "beer"}, {"question_id": 39544, "answer": "beer"}, {"question_id": 39547, "answer": "red book"}, {"question_id": 39550, "answer": "jeremy cohen"}, {"question_id": 39553, "answer": "sprin"}, {"question_id": 39556, "answer": "1000"}, {"question_id": 39559, "answer": "adidas"}, {"question_id": 39562, "answer": "yebisu"}, {"question_id": 39565, "answer": "turn right"}, {"question_id": 39568, "answer": "no"}, {"question_id": 39571, "answer": "tokyo black"}, {"question_id": 39574, "answer": "luu b"}, {"question_id": 39577, "answer": "rubin de casta"}, {"question_id": 39580, "answer": "perris"}, {"question_id": 39583, "answer": "62"}, {"question_id": 39586, "answer": "its bro"}, {"question_id": 39589, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 39592, "answer": "washington"}, {"question_id": 39595, "answer": "taco"}, {"question_id": 39598, "answer": "adidas"}, {"question_id": 39601, "answer": "tonight"}, {"question_id": 34604, "answer": "stone"}, {"question_id": 34607, "answer": "25"}, {"question_id": 34610, "answer": "boy"}, {"question_id": 34613, "answer": "snickers"}, {"question_id": 34616, "answer": "yes"}, {"question_id": 34619, "answer": "100"}, {"question_id": 34622, "answer": "yamaha"}, {"question_id": 34625, "answer": "select time"}, {"question_id": 34628, "answer": "williams"}, {"question_id": 34631, "answer": "king 's cross"}, {"question_id": 34634, "answer": "jefferson rogers"}, {"question_id": 34637, "answer": "bike"}, {"question_id": 34640, "answer": "chronicler 's wildland"}, {"question_id": 34643, "answer": "no"}, {"question_id": 34646, "answer": "chase"}, {"question_id": 34649, "answer": "1 pound"}, {"question_id": 34652, "answer": "sunday"}, {"question_id": 34655, "answer": "9"}, {"question_id": 34658, "answer": "10"}, {"question_id": 34661, "answer": "1835"}, {"question_id": 34664, "answer": "1920"}, {"question_id": 34667, "answer": "marvel"}, {"question_id": 34670, "answer": "hershey"}, {"question_id": 34673, "answer": "nlsi"}, {"question_id": 34676, "answer": "no"}, {"question_id": 34679, "answer": "jiba"}, {"question_id": 34682, "answer": "coca light"}, {"question_id": 34685, "answer": "cathy williams presents: secrets of rustless tycoon"}, {"question_id": 34688, "answer": "fox"}, {"question_id": 34691, "answer": "bear"}, {"question_id": 34694, "answer": "painted veil"}, {"question_id": 34697, "answer": "drummer"}, {"question_id": 34700, "answer": "10:10"}, {"question_id": 34703, "answer": "yes"}, {"question_id": 34706, "answer": "10:00"}, {"question_id": 34709, "answer": "tm"}, {"question_id": 34712, "answer": "no"}, {"question_id": 34715, "answer": "german"}, {"question_id": 34718, "answer": "italy"}, {"question_id": 34721, "answer": "bernard haitink"}, {"question_id": 34724, "answer": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 34727, "answer": "michael kiwanuka"}, {"question_id": 34730, "answer": "10000"}, {"question_id": 34733, "answer": "cockpit"}, {"question_id": 34736, "answer": "people"}, {"question_id": 34739, "answer": "domino"}, {"question_id": 34742, "answer": "fancy dress shop"}, {"question_id": 34745, "answer": "13"}, {"question_id": 34748, "answer": "values"}, {"question_id": 34751, "answer": "wave"}, {"question_id": 34754, "answer": "tower resort hotel"}, {"question_id": 34757, "answer": "united states"}, {"question_id": 34760, "answer": "obama"}, {"question_id": 34763, "answer": "snack star"}, {"question_id": 34766, "answer": "royal"}, {"question_id": 34769, "answer": "ace"}, {"question_id": 34772, "answer": "stitt"}, {"question_id": 34775, "answer": "beer"}, {"question_id": 34778, "answer": "1000"}, {"question_id": 34781, "answer": "geico"}, {"question_id": 34784, "answer": "title of book"}, {"question_id": 34787, "answer": "merlot"}, {"question_id": 34790, "answer": "nora roberts"}, {"question_id": 34793, "answer": "sony"}, {"question_id": 34796, "answer": "canada"}, {"question_id": 34799, "answer": "store"}, {"question_id": 34802, "answer": "baseball"}, {"question_id": 34805, "answer": "1234"}, {"question_id": 34808, "answer": "veterans of foreign wars"}, {"question_id": 34811, "answer": "video"}, {"question_id": 34814, "answer": "mortiz"}, {"question_id": 34817, "answer": "lufthansa"}, {"question_id": 34820, "answer": ""}, {"question_id": 34823, "answer": "h&m"}, {"question_id": 34826, "answer": "packers"}, {"question_id": 34829, "answer": "brick"}, {"question_id": 34832, "answer": "demarcus"}, {"question_id": 34835, "answer": "10:00"}, {"question_id": 34838, "answer": "29"}, {"question_id": 34841, "answer": "tattoo"}, {"question_id": 34844, "answer": "wine"}, {"question_id": 34847, "answer": "100"}, {"question_id": 34850, "answer": "new york"}, {"question_id": 34853, "answer": "2"}, {"question_id": 34856, "answer": ""}, {"question_id": 34859, "answer": "coca cola"}, {"question_id": 34862, "answer": "coca cola"}, {"question_id": 34865, "answer": "yes"}, {"question_id": 34868, "answer": "kicker"}, {"question_id": 34871, "answer": "tiffany 's"}, {"question_id": 34874, "answer": "dell"}, {"question_id": 34877, "answer": "yes"}, {"question_id": 34880, "answer": "soda"}, {"question_id": 34883, "answer": "dell"}, {"question_id": 34886, "answer": "coca"}, {"question_id": 34889, "answer": "samsung"}, {"question_id": 34892, "answer": "b"}, {"question_id": 34895, "answer": "english"}, {"question_id": 34898, "answer": "palm"}, {"question_id": 34901, "answer": "united kingdom"}, {"question_id": 34904, "answer": "tokyo"}, {"question_id": 34907, "answer": "stop"}, {"question_id": 34910, "answer": "summer"}, {"question_id": 34913, "answer": "nature 's harvest"}, {"question_id": 34916, "answer": "squatting"}, {"question_id": 34919, "answer": "shift"}, {"question_id": 34922, "answer": "ws reymont"}, {"question_id": 34925, "answer": "77"}, {"question_id": 34928, "answer": "olive oil"}, {"question_id": 34931, "answer": "ale"}, {"question_id": 34934, "answer": "right"}, {"question_id": 34937, "answer": "10"}, {"question_id": 34940, "answer": "human sport"}, {"question_id": 34943, "answer": "1000000000"}, {"question_id": 34946, "answer": "10:00"}, {"question_id": 34949, "answer": "slideology apa choreografias"}, {"question_id": 34952, "answer": "blue"}, {"question_id": 34955, "answer": "god"}, {"question_id": 34958, "answer": "6478 3000044"}, {"question_id": 34961, "answer": "toshiba"}, {"question_id": 34964, "answer": "full"}, {"question_id": 34967, "answer": "adidas"}, {"question_id": 34970, "answer": "1"}, {"question_id": 34973, "answer": "stop"}, {"question_id": 34976, "answer": "ferry"}, {"question_id": 34979, "answer": "coca cola"}, {"question_id": 34982, "answer": "rock strip"}, {"question_id": 34985, "answer": "romulus"}, {"question_id": 34988, "answer": "weight"}, {"question_id": 34991, "answer": "yes"}, {"question_id": 34994, "answer": "12:40"}, {"question_id": 34997, "answer": "131777"}, {"question_id": 35000, "answer": "soda"}, {"question_id": 35003, "answer": "100"}, {"question_id": 35006, "answer": "i"}, {"question_id": 35009, "answer": "smart mart"}, {"question_id": 35012, "answer": "ola"}, {"question_id": 35015, "answer": "berkley"}, {"question_id": 35018, "answer": "75"}, {"question_id": 35021, "answer": "jean paul sartre"}, {"question_id": 35024, "answer": "no"}, {"question_id": 35027, "answer": "wild"}, {"question_id": 35030, "answer": "mission"}, {"question_id": 35033, "answer": "hopnuts"}, {"question_id": 35036, "answer": "united states"}, {"question_id": 35039, "answer": "bacardi"}, {"question_id": 35042, "answer": ""}, {"question_id": 35045, "answer": "fender"}, {"question_id": 35048, "answer": "nestle"}, {"question_id": 35051, "answer": "samsung"}, {"question_id": 35054, "answer": "japan"}, {"question_id": 35057, "answer": "transportation"}, {"question_id": 35060, "answer": "welcome to deep space diner"}, {"question_id": 35063, "answer": "press"}, {"question_id": 35066, "answer": "12:00"}, {"question_id": 35069, "answer": "f"}, {"question_id": 35072, "answer": "clean"}, {"question_id": 35075, "answer": "pizza and subs"}, {"question_id": 35078, "answer": "no"}, {"question_id": 35081, "answer": "toshiba"}, {"question_id": 35084, "answer": "10:10"}, {"question_id": 35087, "answer": "wii"}, {"question_id": 35090, "answer": "b"}, {"question_id": 35093, "answer": "tv"}, {"question_id": 35096, "answer": "margaret macmillan"}, {"question_id": 35099, "answer": "qantas"}, {"question_id": 35102, "answer": "10:00"}, {"question_id": 35105, "answer": "free"}, {"question_id": 35108, "answer": "walgreens"}, {"question_id": 35111, "answer": "france"}, {"question_id": 35114, "answer": "1780"}, {"question_id": 35117, "answer": "florida hemmers m"}, {"question_id": 35120, "answer": "12"}, {"question_id": 35123, "answer": "lg"}, {"question_id": 35126, "answer": "7 eleven"}, {"question_id": 35129, "answer": "seafood"}, {"question_id": 35132, "answer": "drink"}, {"question_id": 35135, "answer": "martian odyssey ace science fiction library"}, {"question_id": 35138, "answer": "india"}, {"question_id": 35141, "answer": "terminator 2: judgment day"}, {"question_id": 35144, "answer": "yves saint laurent"}, {"question_id": 35147, "answer": "yes"}, {"question_id": 35150, "answer": "10:00"}, {"question_id": 35153, "answer": "warren"}, {"question_id": 35156, "answer": "metal"}, {"question_id": 35159, "answer": "30"}, {"question_id": 35162, "answer": "live"}, {"question_id": 35165, "answer": "caps lock"}, {"question_id": 35168, "answer": "justiceforleviorg"}, {"question_id": 35171, "answer": "gourmet"}, {"question_id": 35174, "answer": "restaurant"}, {"question_id": 35177, "answer": "b"}, {"question_id": 35180, "answer": "lager"}, {"question_id": 35183, "answer": "12:45"}, {"question_id": 35186, "answer": "bike"}, {"question_id": 35189, "answer": "2009"}, {"question_id": 35192, "answer": "lee ho kim"}, {"question_id": 35195, "answer": "slogan"}, {"question_id": 35198, "answer": "guidance"}, {"question_id": 35201, "answer": "profile"}, {"question_id": 35204, "answer": "guinness"}, {"question_id": 35207, "answer": "montreal"}, {"question_id": 35210, "answer": "coffee"}, {"question_id": 35213, "answer": "black"}, {"question_id": 35216, "answer": "2012"}, {"question_id": 35219, "answer": "chino"}, {"question_id": 35222, "answer": "magnum"}, {"question_id": 35225, "answer": "yes"}, {"question_id": 35228, "answer": "large"}, {"question_id": 35231, "answer": "daily create"}, {"question_id": 35234, "answer": "16"}, {"question_id": 35237, "answer": "left"}, {"question_id": 35240, "answer": "sai yu siyu"}, {"question_id": 35243, "answer": "12:00"}, {"question_id": 35246, "answer": "budweiser"}, {"question_id": 35249, "answer": "adobe"}, {"question_id": 35252, "answer": "beer"}, {"question_id": 35255, "answer": "it was slow brewed for taste"}, {"question_id": 35258, "answer": "blackberry"}, {"question_id": 35261, "answer": "united states"}, {"question_id": 35264, "answer": "tokyo"}, {"question_id": 35267, "answer": "new york"}, {"question_id": 35270, "answer": "chili bowl"}, {"question_id": 35273, "answer": "no"}, {"question_id": 35276, "answer": "10"}, {"question_id": 35279, "answer": "thug"}, {"question_id": 35282, "answer": "35"}, {"question_id": 35285, "answer": "mexico"}, {"question_id": 35288, "answer": "green"}, {"question_id": 35291, "answer": "407"}, {"question_id": 35294, "answer": "zedd"}, {"question_id": 35297, "answer": "slagger"}, {"question_id": 35300, "answer": "fischer traub"}, {"question_id": 35303, "answer": "yes"}, {"question_id": 35306, "answer": "digital"}, {"question_id": 35309, "answer": "10"}, {"question_id": 35312, "answer": "bud"}, {"question_id": 35315, "answer": "harsh 's shadow"}, {"question_id": 35318, "answer": "antibiotics"}, {"question_id": 35321, "answer": "1200"}, {"question_id": 35324, "answer": "gshock"}, {"question_id": 35327, "answer": "10000"}, {"question_id": 35330, "answer": "harold 's"}, {"question_id": 35333, "answer": "10 22 2012"}, {"question_id": 35336, "answer": "pepsi"}, {"question_id": 35339, "answer": "0123456789"}, {"question_id": 35342, "answer": "rolex"}, {"question_id": 35345, "answer": "stairwell"}, {"question_id": 35348, "answer": "xp"}, {"question_id": 35351, "answer": "father"}, {"question_id": 35354, "answer": "roger 's"}, {"question_id": 35357, "answer": "bourbon"}, {"question_id": 35360, "answer": "001"}, {"question_id": 35363, "answer": "e"}, {"question_id": 35366, "answer": "100"}, {"question_id": 35369, "answer": "australia"}, {"question_id": 35372, "answer": "1"}, {"question_id": 35375, "answer": "beacon"}, {"question_id": 35378, "answer": "stairs"}, {"question_id": 35381, "answer": "30 zone"}, {"question_id": 35384, "answer": "100"}, {"question_id": 35387, "answer": "2009"}, {"question_id": 35390, "answer": "trap shy blonde"}, {"question_id": 35393, "answer": "dreyer 's"}, {"question_id": 35396, "answer": "cemetery mausoleum and cremations"}, {"question_id": 35399, "answer": "france"}, {"question_id": 35402, "answer": "6000000000"}, {"question_id": 35405, "answer": "dior"}, {"question_id": 35408, "answer": "general"}, {"question_id": 35411, "answer": "nasa"}, {"question_id": 35414, "answer": "clear"}, {"question_id": 35417, "answer": "face"}, {"question_id": 35420, "answer": "taxi"}, {"question_id": 35423, "answer": "tv"}, {"question_id": 35426, "answer": "germany"}, {"question_id": 35429, "answer": "13"}, {"question_id": 35432, "answer": "vancouver"}, {"question_id": 35435, "answer": "16"}, {"question_id": 35438, "answer": "lady premontre"}, {"question_id": 35441, "answer": "1899"}, {"question_id": 35444, "answer": "dell"}, {"question_id": 35447, "answer": "cacao"}, {"question_id": 35450, "answer": "53"}, {"question_id": 35453, "answer": "bill"}, {"question_id": 35456, "answer": "to get lot of volunteers"}, {"question_id": 35459, "answer": "michigan stadium"}, {"question_id": 35462, "answer": "beer"}, {"question_id": 35465, "answer": "vedett"}, {"question_id": 35468, "answer": "art magazine"}, {"question_id": 35471, "answer": "10:00am"}, {"question_id": 35474, "answer": "all"}, {"question_id": 35477, "answer": "yes"}, {"question_id": 35480, "answer": "no"}, {"question_id": 35483, "answer": "blackberry"}, {"question_id": 35486, "answer": "joan baez"}, {"question_id": 35489, "answer": "100"}, {"question_id": 35492, "answer": "10:12"}, {"question_id": 35495, "answer": "charrette"}, {"question_id": 35498, "answer": "1234"}, {"question_id": 35501, "answer": "heineken"}, {"question_id": 35504, "answer": "cyclops"}, {"question_id": 35507, "answer": "red"}, {"question_id": 35510, "answer": "website"}, {"question_id": 35513, "answer": "whiskey"}, {"question_id": 35516, "answer": "10"}, {"question_id": 35519, "answer": "no entry"}, {"question_id": 35522, "answer": "lamomcom"}, {"question_id": 35525, "answer": "california"}, {"question_id": 35528, "answer": "2004"}, {"question_id": 35531, "answer": "20"}, {"question_id": 35534, "answer": "11:00"}, {"question_id": 35537, "answer": "$1.99"}, {"question_id": 35540, "answer": "yes"}, {"question_id": 35543, "answer": "13"}, {"question_id": 35546, "answer": "italy"}, {"question_id": 35549, "answer": "jpeg secret"}, {"question_id": 35552, "answer": "apple"}, {"question_id": 35555, "answer": "what is difference"}, {"question_id": 35558, "answer": "yes"}, {"question_id": 35561, "answer": "$6"}, {"question_id": 35564, "answer": "smirnoff"}, {"question_id": 35567, "answer": "god"}, {"question_id": 35570, "answer": "15"}, {"question_id": 35573, "answer": "archaeopteryx"}, {"question_id": 35576, "answer": "epi"}, {"question_id": 35579, "answer": "fallin' water cookbook"}, {"question_id": 35582, "answer": "10"}, {"question_id": 35585, "answer": "11"}, {"question_id": 35588, "answer": "10"}, {"question_id": 35591, "answer": "coca cola"}, {"question_id": 35594, "answer": "toys"}, {"question_id": 35597, "answer": "treasures"}, {"question_id": 35600, "answer": "16"}, {"question_id": 35603, "answer": "rick riordan"}, {"question_id": 35606, "answer": "tourism"}, {"question_id": 35609, "answer": "toshiba"}, {"question_id": 35612, "answer": "bus"}, {"question_id": 35615, "answer": "talk"}, {"question_id": 35618, "answer": "120"}, {"question_id": 35621, "answer": "royals"}, {"question_id": 35624, "answer": "10"}, {"question_id": 35627, "answer": "los lobos beach"}, {"question_id": 35630, "answer": "101"}, {"question_id": 35633, "answer": "1980"}, {"question_id": 35636, "answer": "beer"}, {"question_id": 35639, "answer": "7"}, {"question_id": 35642, "answer": "red"}, {"question_id": 35645, "answer": "techno man"}, {"question_id": 35648, "answer": "999"}, {"question_id": 35651, "answer": "12 inches"}, {"question_id": 35654, "answer": "2015"}, {"question_id": 35657, "answer": "live"}, {"question_id": 35660, "answer": "sony"}, {"question_id": 35663, "answer": "24"}, {"question_id": 35666, "answer": "100"}, {"question_id": 35669, "answer": "100"}, {"question_id": 35672, "answer": "1970"}, {"question_id": 35675, "answer": "kim 's"}, {"question_id": 35678, "answer": "dodgers"}, {"question_id": 35681, "answer": "101"}, {"question_id": 35684, "answer": "river rock"}, {"question_id": 35687, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 35690, "answer": "roof"}, {"question_id": 35693, "answer": "barber"}, {"question_id": 35696, "answer": "6"}, {"question_id": 35699, "answer": ""}, {"question_id": 35702, "answer": "surfing"}, {"question_id": 35705, "answer": "12"}, {"question_id": 35708, "answer": "donut world"}, {"question_id": 35711, "answer": "10"}, {"question_id": 35714, "answer": "meier"}, {"question_id": 35717, "answer": "rachel kramer bussel"}, {"question_id": 35720, "answer": "beer"}, {"question_id": 35723, "answer": "vnes"}, {"question_id": 35726, "answer": "rl10"}, {"question_id": 35729, "answer": "25"}, {"question_id": 35732, "answer": "fitbit"}, {"question_id": 35735, "answer": "nano computing"}, {"question_id": 35738, "answer": "orioles"}, {"question_id": 35741, "answer": "mcconnell"}, {"question_id": 35744, "answer": "cola"}, {"question_id": 35747, "answer": "clock"}, {"question_id": 35750, "answer": "saturday"}, {"question_id": 35753, "answer": "50"}, {"question_id": 35756, "answer": "pepsi"}, {"question_id": 35759, "answer": "7042222"}, {"question_id": 35762, "answer": "silver surfer"}, {"question_id": 35765, "answer": "freddie"}, {"question_id": 35768, "answer": "honda"}, {"question_id": 35771, "answer": "gregory keyes"}, {"question_id": 35774, "answer": "1940"}, {"question_id": 35777, "answer": "b"}, {"question_id": 35780, "answer": "how to be agent in occupied europe"}, {"question_id": 35783, "answer": "400 commercial way"}, {"question_id": 35786, "answer": "red"}, {"question_id": 35789, "answer": "stop"}, {"question_id": 35792, "answer": "24"}, {"question_id": 35795, "answer": "178"}, {"question_id": 35798, "answer": "amazon"}, {"question_id": 35801, "answer": "graffiti"}, {"question_id": 35804, "answer": "ryanair"}, {"question_id": 35807, "answer": "skaters"}, {"question_id": 35810, "answer": "book"}, {"question_id": 35813, "answer": "yes"}, {"question_id": 35816, "answer": "japan"}, {"question_id": 35819, "answer": "yes"}, {"question_id": 35822, "answer": "ibiza"}, {"question_id": 35825, "answer": "charming"}, {"question_id": 35828, "answer": "croft"}, {"question_id": 35831, "answer": "lg"}, {"question_id": 35834, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 35837, "answer": "10:10"}, {"question_id": 35840, "answer": "coca cola"}, {"question_id": 35843, "answer": "10"}, {"question_id": 35846, "answer": "10 20 2012"}, {"question_id": 35849, "answer": "usa"}, {"question_id": 35852, "answer": "raymond weil"}, {"question_id": 35855, "answer": "laptop"}, {"question_id": 35858, "answer": "1999"}, {"question_id": 35861, "answer": "california"}, {"question_id": 35864, "answer": "16"}, {"question_id": 35867, "answer": "books"}, {"question_id": 35870, "answer": "bank of america"}, {"question_id": 35873, "answer": "doggy"}, {"question_id": 35876, "answer": "punk"}, {"question_id": 35879, "answer": "yes"}, {"question_id": 35882, "answer": "x u"}, {"question_id": 35885, "answer": "emelektistacom"}, {"question_id": 35888, "answer": "1800s"}, {"question_id": 35891, "answer": "no"}, {"question_id": 35894, "answer": "10"}, {"question_id": 35897, "answer": "10"}, {"question_id": 35900, "answer": "2009"}, {"question_id": 35903, "answer": "1000"}, {"question_id": 35906, "answer": "pole 's garden seeds 1890"}, {"question_id": 35909, "answer": "stop"}, {"question_id": 35912, "answer": "curt meyerowitz"}, {"question_id": 35915, "answer": "chateau"}, {"question_id": 35918, "answer": "50"}, {"question_id": 35921, "answer": "jack daniel 's"}, {"question_id": 35924, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 35927, "answer": "10"}, {"question_id": 35930, "answer": "city"}, {"question_id": 35933, "answer": "city"}, {"question_id": 35936, "answer": "10"}, {"question_id": 35939, "answer": "auburn"}, {"question_id": 35942, "answer": "microscope"}, {"question_id": 35945, "answer": "torres"}, {"question_id": 35948, "answer": "lonram"}, {"question_id": 35951, "answer": "global scale"}, {"question_id": 35954, "answer": "safety marshal"}, {"question_id": 35957, "answer": "g"}, {"question_id": 35960, "answer": "cc"}, {"question_id": 35963, "answer": "c"}, {"question_id": 35966, "answer": "kunstmuseum"}, {"question_id": 35969, "answer": "london"}, {"question_id": 35972, "answer": "e"}, {"question_id": 35975, "answer": "w w w w"}, {"question_id": 35978, "answer": "yes"}, {"question_id": 35981, "answer": "viking river cruises"}, {"question_id": 35984, "answer": "16"}, {"question_id": 35987, "answer": "top hit"}, {"question_id": 35990, "answer": "nike"}, {"question_id": 35993, "answer": "jet"}, {"question_id": 35996, "answer": "dogolga"}, {"question_id": 35999, "answer": "turn"}, {"question_id": 36002, "answer": "yes"}, {"question_id": 36005, "answer": "stout"}, {"question_id": 36008, "answer": "candy"}, {"question_id": 36011, "answer": "outrun"}, {"question_id": 36014, "answer": "nokia"}, {"question_id": 36017, "answer": "build"}, {"question_id": 36020, "answer": "touch control"}, {"question_id": 36023, "answer": "chicago"}, {"question_id": 36026, "answer": "hewlett packard"}, {"question_id": 36029, "answer": "anthony shaffer"}, {"question_id": 36032, "answer": "1999"}, {"question_id": 36035, "answer": "rnk"}, {"question_id": 36038, "answer": "nothing"}, {"question_id": 36041, "answer": "stihl"}, {"question_id": 36044, "answer": "1960s"}, {"question_id": 36047, "answer": "tony and mary scanlon"}, {"question_id": 36050, "answer": "2"}, {"question_id": 36053, "answer": "no"}, {"question_id": 36056, "answer": "vivarriba"}, {"question_id": 36059, "answer": "patterns"}, {"question_id": 36062, "answer": "forward"}, {"question_id": 36065, "answer": "red"}, {"question_id": 36068, "answer": "10"}, {"question_id": 36071, "answer": "light"}, {"question_id": 36074, "answer": "taste"}, {"question_id": 36077, "answer": "la raza"}, {"question_id": 36080, "answer": "book"}, {"question_id": 36083, "answer": "rum"}, {"question_id": 36086, "answer": "tales of cacophony society"}, {"question_id": 36089, "answer": "craft brewery"}, {"question_id": 36092, "answer": "air force"}, {"question_id": 36095, "answer": "philadelphia"}, {"question_id": 36098, "answer": "furnished"}, {"question_id": 36101, "answer": "war"}, {"question_id": 36104, "answer": "bimbo"}, {"question_id": 36107, "answer": "gillette"}, {"question_id": 36110, "answer": "tefal"}, {"question_id": 36113, "answer": "10:10"}, {"question_id": 36116, "answer": "yes"}, {"question_id": 36119, "answer": "holiday"}, {"question_id": 36122, "answer": "citi"}, {"question_id": 36125, "answer": "paul and faithfulness of god"}, {"question_id": 36128, "answer": "779"}, {"question_id": 36131, "answer": "kumamoto"}, {"question_id": 36134, "answer": "white"}, {"question_id": 36137, "answer": "10:00"}, {"question_id": 36140, "answer": "yes"}, {"question_id": 36143, "answer": "boston"}, {"question_id": 36146, "answer": "borjomi"}, {"question_id": 36149, "answer": "i am student"}, {"question_id": 36152, "answer": "volcano"}, {"question_id": 36155, "answer": "bayonetta"}, {"question_id": 36158, "answer": "prophets of world"}, {"question_id": 36161, "answer": "randy j hunt"}, {"question_id": 36164, "answer": "temple of rock"}, {"question_id": 36167, "answer": "yes"}, {"question_id": 36170, "answer": "swan"}, {"question_id": 36173, "answer": "11 liberty st"}, {"question_id": 36176, "answer": "no"}, {"question_id": 36179, "answer": "sunday"}, {"question_id": 36182, "answer": "west"}, {"question_id": 36185, "answer": "hipbarriocom"}, {"question_id": 36188, "answer": "0000000000"}, {"question_id": 36191, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 36194, "answer": "john f kennedy"}, {"question_id": 36197, "answer": "dare"}, {"question_id": 36200, "answer": "puma"}, {"question_id": 36203, "answer": "performance"}, {"question_id": 36206, "answer": "bible"}, {"question_id": 36209, "answer": "2011"}, {"question_id": 36212, "answer": "hewlett packard"}, {"question_id": 36215, "answer": "imac"}, {"question_id": 36218, "answer": "bbc"}, {"question_id": 36221, "answer": "apologia in difesa della s s inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior inferior infer"}, {"question_id": 36224, "answer": "pepsi"}, {"question_id": 36227, "answer": "t"}, {"question_id": 36230, "answer": "writing in new york: literary anthology"}, {"question_id": 36233, "answer": "fortuna"}, {"question_id": 36236, "answer": "green"}, {"question_id": 36239, "answer": "cake day"}, {"question_id": 36242, "answer": "frenchman 's"}, {"question_id": 36245, "answer": "classical music"}, {"question_id": 36248, "answer": "10:30"}, {"question_id": 36251, "answer": "10"}, {"question_id": 36254, "answer": "1999"}, {"question_id": 36257, "answer": "349.99"}, {"question_id": 36260, "answer": "sweet"}, {"question_id": 36263, "answer": "2014"}, {"question_id": 36266, "answer": "500"}, {"question_id": 36269, "answer": "nutrient"}, {"question_id": 36272, "answer": "hot wheels"}, {"question_id": 36275, "answer": "concordia"}, {"question_id": 36278, "answer": "acers"}, {"question_id": 36281, "answer": "kitchen"}, {"question_id": 36284, "answer": "stop"}, {"question_id": 36287, "answer": "game"}, {"question_id": 36290, "answer": "wii"}, {"question_id": 36293, "answer": "firefox"}, {"question_id": 36296, "answer": "dylan rhodes"}, {"question_id": 36299, "answer": "limits"}, {"question_id": 36302, "answer": "12:00"}, {"question_id": 36305, "answer": "sugar"}, {"question_id": 36308, "answer": "peace"}, {"question_id": 36311, "answer": "samsung"}, {"question_id": 36314, "answer": "help"}, {"question_id": 36317, "answer": "sunkist"}, {"question_id": 36320, "answer": "2009"}, {"question_id": 36323, "answer": "3"}, {"question_id": 36326, "answer": "power"}, {"question_id": 36329, "answer": "i don't believe in love but i do believe in love"}, {"question_id": 36332, "answer": "ram"}, {"question_id": 36335, "answer": "honolulu"}, {"question_id": 36338, "answer": "stop sign"}, {"question_id": 36341, "answer": "no"}, {"question_id": 36344, "answer": "rapleys"}, {"question_id": 36347, "answer": "spanish"}, {"question_id": 36350, "answer": "bongganger"}, {"question_id": 36353, "answer": "pf"}, {"question_id": 36356, "answer": "verizon"}, {"question_id": 36359, "answer": "cinnamon"}, {"question_id": 36362, "answer": "1030"}, {"question_id": 36365, "answer": "yes"}, {"question_id": 36368, "answer": "1"}, {"question_id": 36371, "answer": "graffiti"}, {"question_id": 36374, "answer": "soccer"}, {"question_id": 36377, "answer": "monarch"}, {"question_id": 36380, "answer": "pirates of caribbean"}, {"question_id": 36383, "answer": "44"}, {"question_id": 36386, "answer": "roger norrington"}, {"question_id": 36389, "answer": "program"}, {"question_id": 36392, "answer": "coolab"}, {"question_id": 36395, "answer": "musicians"}, {"question_id": 36398, "answer": "garage"}, {"question_id": 36401, "answer": "texas"}, {"question_id": 36404, "answer": "text"}, {"question_id": 36407, "answer": "skyliners"}, {"question_id": 36410, "answer": "cherry"}, {"question_id": 36413, "answer": "wine"}, {"question_id": 36416, "answer": "blue"}, {"question_id": 36419, "answer": "500mg"}, {"question_id": 36422, "answer": "santa monica boulevard"}, {"question_id": 36425, "answer": "red"}, {"question_id": 36428, "answer": "12 18"}, {"question_id": 36431, "answer": "casio"}, {"question_id": 36434, "answer": "fujifilm"}, {"question_id": 36437, "answer": "food"}, {"question_id": 36440, "answer": "new york"}, {"question_id": 36443, "answer": "parking staff"}, {"question_id": 36446, "answer": "cola"}, {"question_id": 36449, "answer": "kikii"}, {"question_id": 36452, "answer": "3"}, {"question_id": 36455, "answer": "bernard"}, {"question_id": 36458, "answer": "dell"}, {"question_id": 36461, "answer": "10:10"}, {"question_id": 36464, "answer": "beer"}, {"question_id": 36467, "answer": "don't touch"}, {"question_id": 36470, "answer": "eucherre and court of king 's bench"}, {"question_id": 36473, "answer": "chateau d'or"}, {"question_id": 36476, "answer": "2012"}, {"question_id": 36479, "answer": "beer"}, {"question_id": 36482, "answer": "yes"}, {"question_id": 36485, "answer": "10:10"}, {"question_id": 36488, "answer": "lg"}, {"question_id": 36491, "answer": "50"}, {"question_id": 36494, "answer": "carl sandburg"}, {"question_id": 36497, "answer": "crock pot"}, {"question_id": 36500, "answer": "yes"}, {"question_id": 36503, "answer": "4470"}, {"question_id": 36506, "answer": "17"}, {"question_id": 36509, "answer": "r"}, {"question_id": 36512, "answer": "15"}, {"question_id": 36515, "answer": "stop"}, {"question_id": 36518, "answer": "single phase"}, {"question_id": 36521, "answer": "chocolate"}, {"question_id": 36524, "answer": "nytaxi"}, {"question_id": 36527, "answer": "new york"}, {"question_id": 36530, "answer": "fifth elephant"}, {"question_id": 36533, "answer": "bank"}, {"question_id": 36536, "answer": "wheelchair"}, {"question_id": 36539, "answer": "katz 's drugs"}, {"question_id": 36542, "answer": "stop"}, {"question_id": 36545, "answer": "trucks"}, {"question_id": 36548, "answer": "red"}, {"question_id": 36551, "answer": "palm"}, {"question_id": 36554, "answer": "12"}, {"question_id": 36557, "answer": "x"}, {"question_id": 36560, "answer": "large in world even creatures you can see"}, {"question_id": 36563, "answer": "3"}, {"question_id": 36566, "answer": "20"}, {"question_id": 36569, "answer": "1999"}, {"question_id": 36572, "answer": "juan"}, {"question_id": 36575, "answer": "etna"}, {"question_id": 36578, "answer": "london"}, {"question_id": 36581, "answer": "mariners"}, {"question_id": 36584, "answer": "19"}, {"question_id": 36587, "answer": "nothing"}, {"question_id": 36590, "answer": "sex"}, {"question_id": 36593, "answer": "beeze"}, {"question_id": 36596, "answer": "renesse"}, {"question_id": 36599, "answer": "tacos"}, {"question_id": 36602, "answer": "17"}, {"question_id": 36605, "answer": "pills"}, {"question_id": 36608, "answer": "$10"}, {"question_id": 36611, "answer": "beer"}, {"question_id": 36614, "answer": "banana"}, {"question_id": 36617, "answer": "crucifixion"}, {"question_id": 36620, "answer": "no logo"}, {"question_id": 36623, "answer": "fairroux"}, {"question_id": 36626, "answer": "climbing"}, {"question_id": 36629, "answer": "13:45"}, {"question_id": 36632, "answer": "beer"}, {"question_id": 36635, "answer": "calendar"}, {"question_id": 36638, "answer": "5"}, {"question_id": 36641, "answer": "no"}, {"question_id": 36644, "answer": "16"}, {"question_id": 36647, "answer": "16"}, {"question_id": 36650, "answer": "pepsi"}, {"question_id": 36653, "answer": "10:00"}, {"question_id": 36656, "answer": "it 's game"}, {"question_id": 36659, "answer": "100"}, {"question_id": 36662, "answer": "saber"}, {"question_id": 36665, "answer": "1963"}, {"question_id": 36668, "answer": "8"}, {"question_id": 36671, "answer": "stop"}, {"question_id": 36674, "answer": "100"}, {"question_id": 36677, "answer": "10:00"}, {"question_id": 36680, "answer": "web accessibility"}, {"question_id": 36683, "answer": "stjiliorno"}, {"question_id": 36686, "answer": "10:00"}, {"question_id": 36689, "answer": "javasunseker"}, {"question_id": 36692, "answer": "sushi"}, {"question_id": 36695, "answer": "blue"}, {"question_id": 36698, "answer": ""}, {"question_id": 36701, "answer": "boston"}, {"question_id": 36704, "answer": "lancome"}, {"question_id": 36707, "answer": "ammatica"}, {"question_id": 36710, "answer": "10 10 2010"}, {"question_id": 36713, "answer": "phone number"}, {"question_id": 36716, "answer": "to"}, {"question_id": 36719, "answer": "bacardi"}, {"question_id": 36722, "answer": "target"}, {"question_id": 36725, "answer": "buy"}, {"question_id": 36728, "answer": "n pushpashev rus"}, {"question_id": 36731, "answer": "milan"}, {"question_id": 36734, "answer": "2007"}, {"question_id": 36737, "answer": "cummins"}, {"question_id": 36740, "answer": "10"}, {"question_id": 36743, "answer": "utah"}, {"question_id": 36746, "answer": "taxi"}, {"question_id": 36749, "answer": "1"}, {"question_id": 36752, "answer": "santa rita"}, {"question_id": 36755, "answer": "end of world"}, {"question_id": 36758, "answer": "1080p"}, {"question_id": 36761, "answer": "bottle"}, {"question_id": 36764, "answer": "brisbane"}, {"question_id": 36767, "answer": "100"}, {"question_id": 36770, "answer": "10:10"}, {"question_id": 36773, "answer": "mentimeter"}, {"question_id": 36776, "answer": "silver"}, {"question_id": 36779, "answer": "camel"}, {"question_id": 36782, "answer": "achieving agility and discipline made easy"}, {"question_id": 36785, "answer": "1890"}, {"question_id": 36788, "answer": "10:10"}, {"question_id": 36791, "answer": "verizon"}, {"question_id": 36794, "answer": "black book"}, {"question_id": 36797, "answer": "space"}, {"question_id": 36800, "answer": "india"}, {"question_id": 36803, "answer": "no"}, {"question_id": 36806, "answer": "syrup"}, {"question_id": 36809, "answer": "yes"}, {"question_id": 36812, "answer": "10:10"}, {"question_id": 36815, "answer": "back"}, {"question_id": 36818, "answer": "well 's"}, {"question_id": 36821, "answer": "baseball"}, {"question_id": 36824, "answer": "flickr"}, {"question_id": 36827, "answer": "9"}, {"question_id": 36830, "answer": "airasia"}, {"question_id": 36833, "answer": "beer"}, {"question_id": 36836, "answer": "1890"}, {"question_id": 36839, "answer": "jean cocteau"}, {"question_id": 36842, "answer": "no"}, {"question_id": 36845, "answer": "playground"}, {"question_id": 36848, "answer": "cts"}, {"question_id": 36851, "answer": "beer"}, {"question_id": 36854, "answer": "aperol"}, {"question_id": 36857, "answer": "gobo"}, {"question_id": 36860, "answer": "0%"}, {"question_id": 36863, "answer": "no text"}, {"question_id": 36866, "answer": "white horse"}, {"question_id": 36869, "answer": "oil"}, {"question_id": 36872, "answer": "graffiti"}, {"question_id": 36875, "answer": "lg phones"}, {"question_id": 36878, "answer": "111"}, {"question_id": 36881, "answer": "june"}, {"question_id": 36884, "answer": "symphony no 1"}, {"question_id": 36887, "answer": "100"}, {"question_id": 36890, "answer": "life for vietnam"}, {"question_id": 36893, "answer": "485"}, {"question_id": 36896, "answer": "sellforandfind"}, {"question_id": 36899, "answer": "introduction"}, {"question_id": 36902, "answer": "trash"}, {"question_id": 36905, "answer": "pinarello"}, {"question_id": 36908, "answer": "disney"}, {"question_id": 36911, "answer": "no"}, {"question_id": 36914, "answer": "conquer"}, {"question_id": 36917, "answer": "toronto"}, {"question_id": 36920, "answer": "cobra"}, {"question_id": 36923, "answer": "saturday"}, {"question_id": 36926, "answer": "11"}, {"question_id": 36929, "answer": "children 's products"}, {"question_id": 36932, "answer": "peace"}, {"question_id": 36935, "answer": ""}, {"question_id": 36938, "answer": "mac"}, {"question_id": 36941, "answer": "yes"}, {"question_id": 36944, "answer": "isle"}, {"question_id": 36947, "answer": "timex"}, {"question_id": 36950, "answer": "idiot"}, {"question_id": 36953, "answer": "etisalat"}, {"question_id": 36956, "answer": "texas"}, {"question_id": 36959, "answer": "brand"}, {"question_id": 36962, "answer": "yes"}, {"question_id": 36965, "answer": "united states"}, {"question_id": 36968, "answer": "yes"}, {"question_id": 36971, "answer": "chanel"}, {"question_id": 36974, "answer": "tasmanian"}, {"question_id": 36977, "answer": "diet coke"}, {"question_id": 36980, "answer": "35"}, {"question_id": 36983, "answer": "absinth"}, {"question_id": 36986, "answer": "alan"}, {"question_id": 36989, "answer": "108"}, {"question_id": 36992, "answer": "wine"}, {"question_id": 36995, "answer": "5"}, {"question_id": 36998, "answer": "workshop"}, {"question_id": 37001, "answer": "kate valien"}, {"question_id": 37004, "answer": "coca cola"}, {"question_id": 37007, "answer": "powered"}, {"question_id": 37010, "answer": "tamale town"}, {"question_id": 37013, "answer": "8"}, {"question_id": 37016, "answer": "40"}, {"question_id": 37019, "answer": "stop"}, {"question_id": 37022, "answer": "yes"}, {"question_id": 37025, "answer": "blue"}, {"question_id": 37028, "answer": "100%"}, {"question_id": 37031, "answer": "vodka"}, {"question_id": 37034, "answer": "monday"}, {"question_id": 37037, "answer": "postage stamps"}, {"question_id": 37040, "answer": "stop on red"}, {"question_id": 37043, "answer": "fries"}, {"question_id": 37046, "answer": "blue"}, {"question_id": 37049, "answer": "wood"}, {"question_id": 37052, "answer": "2013 2014"}, {"question_id": 37055, "answer": "lightfoot"}, {"question_id": 37058, "answer": "5"}, {"question_id": 37061, "answer": "right"}, {"question_id": 37064, "answer": "john gage"}, {"question_id": 37067, "answer": "pacari"}, {"question_id": 37070, "answer": "friends"}, {"question_id": 37073, "answer": "10:30"}, {"question_id": 37076, "answer": "amsterdam"}, {"question_id": 37079, "answer": "phone"}, {"question_id": 37082, "answer": "california"}, {"question_id": 37085, "answer": "first"}, {"question_id": 37088, "answer": "beginning"}, {"question_id": 37091, "answer": "norway"}, {"question_id": 37094, "answer": "cap"}, {"question_id": 37097, "answer": "102"}, {"question_id": 37100, "answer": "van gogh"}, {"question_id": 37103, "answer": "1974 1980"}, {"question_id": 37106, "answer": "sony"}, {"question_id": 37109, "answer": "cat"}, {"question_id": 37112, "answer": "10:10"}, {"question_id": 37115, "answer": "united states"}, {"question_id": 37118, "answer": "75"}, {"question_id": 37121, "answer": "laptop"}, {"question_id": 37124, "answer": "yes"}, {"question_id": 37127, "answer": "jose constao"}, {"question_id": 37130, "answer": "1525"}, {"question_id": 37133, "answer": "burrman geared boxes"}, {"question_id": 37136, "answer": "sony"}, {"question_id": 37139, "answer": "norwegian"}, {"question_id": 37142, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37145, "answer": "france"}, {"question_id": 37148, "answer": "16"}, {"question_id": 37151, "answer": "yes"}, {"question_id": 37154, "answer": "dell"}, {"question_id": 37157, "answer": "tasmania"}, {"question_id": 37160, "answer": "kenji misumi"}, {"question_id": 37163, "answer": "100"}, {"question_id": 37166, "answer": "you"}, {"question_id": 37169, "answer": "west"}, {"question_id": 37172, "answer": "virginia"}, {"question_id": 37175, "answer": "19"}, {"question_id": 37178, "answer": "10:00"}, {"question_id": 37181, "answer": "amsterdam"}, {"question_id": 37184, "answer": "verizon"}, {"question_id": 37187, "answer": "cooking in compact space"}, {"question_id": 37190, "answer": "apple"}, {"question_id": 37193, "answer": "jr"}, {"question_id": 37196, "answer": "washington"}, {"question_id": 37199, "answer": "california"}, {"question_id": 37202, "answer": "100"}, {"question_id": 37205, "answer": "yes"}, {"question_id": 37208, "answer": "gertrude f leroy"}, {"question_id": 37211, "answer": "lcd"}, {"question_id": 37214, "answer": "minix"}, {"question_id": 37217, "answer": "16.9"}, {"question_id": 37220, "answer": "dublin"}, {"question_id": 37223, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 37226, "answer": "ben"}, {"question_id": 37229, "answer": "stop"}, {"question_id": 37232, "answer": "microphone"}, {"question_id": 37235, "answer": "east"}, {"question_id": 37238, "answer": "identita"}, {"question_id": 37241, "answer": "yes"}, {"question_id": 37244, "answer": "puma"}, {"question_id": 37247, "answer": "shadow of watching star"}, {"question_id": 37250, "answer": "kentucky"}, {"question_id": 37253, "answer": "garmin"}, {"question_id": 37256, "answer": "ahl"}, {"question_id": 37259, "answer": "canon cat"}, {"question_id": 37262, "answer": "restaurant"}, {"question_id": 37265, "answer": "cards"}, {"question_id": 37268, "answer": "hampstead roads"}, {"question_id": 37271, "answer": "i'm smart i can do things"}, {"question_id": 37274, "answer": "dungeons of doom"}, {"question_id": 37277, "answer": "no"}, {"question_id": 37280, "answer": "mac"}, {"question_id": 37283, "answer": "177"}, {"question_id": 37286, "answer": "robert heinlein"}, {"question_id": 37289, "answer": "no"}, {"question_id": 37292, "answer": "golden"}, {"question_id": 37295, "answer": "bushmills"}, {"question_id": 37298, "answer": "ja house"}, {"question_id": 37301, "answer": "enter"}, {"question_id": 37304, "answer": "299"}, {"question_id": 37307, "answer": "1815"}, {"question_id": 37310, "answer": "failsworth"}, {"question_id": 37313, "answer": "vietnamcom"}, {"question_id": 37316, "answer": "open data"}, {"question_id": 37319, "answer": "100"}, {"question_id": 37322, "answer": "samsung"}, {"question_id": 37325, "answer": "2"}, {"question_id": 37328, "answer": "1"}, {"question_id": 37331, "answer": "candy"}, {"question_id": 37334, "answer": "2"}, {"question_id": 37337, "answer": "sunexpress"}, {"question_id": 37340, "answer": "yes"}, {"question_id": 37343, "answer": "be happy"}, {"question_id": 37346, "answer": "dubai"}, {"question_id": 37349, "answer": "as above so below"}, {"question_id": 37352, "answer": "ii"}, {"question_id": 37355, "answer": "rolex"}, {"question_id": 37358, "answer": "m"}, {"question_id": 37361, "answer": "kuala malas"}, {"question_id": 37364, "answer": "107 107"}, {"question_id": 37367, "answer": "e"}, {"question_id": 37370, "answer": "yes"}, {"question_id": 37373, "answer": "yes"}, {"question_id": 37376, "answer": "orange"}, {"question_id": 37379, "answer": "7"}, {"question_id": 37382, "answer": "sign"}, {"question_id": 37385, "answer": "sun"}, {"question_id": 37388, "answer": "beer"}, {"question_id": 37391, "answer": "9 am"}, {"question_id": 37394, "answer": "chapter 10"}, {"question_id": 37397, "answer": "navy"}, {"question_id": 37400, "answer": "1801"}, {"question_id": 37403, "answer": "balvene"}, {"question_id": 37406, "answer": "1998"}, {"question_id": 37409, "answer": "j k rowling"}, {"question_id": 37412, "answer": "waste"}, {"question_id": 37415, "answer": "sharepoint"}, {"question_id": 37418, "answer": "universe"}, {"question_id": 37421, "answer": "london"}, {"question_id": 37424, "answer": "interstate"}, {"question_id": 37427, "answer": "mount riley"}, {"question_id": 37430, "answer": "st thomas"}, {"question_id": 37433, "answer": "old picture of church"}, {"question_id": 37436, "answer": "hugo gernsback"}, {"question_id": 37439, "answer": "white hot tee"}, {"question_id": 37442, "answer": "sunriver"}, {"question_id": 37445, "answer": "hong kong"}, {"question_id": 37448, "answer": "white"}, {"question_id": 37451, "answer": "cuervo"}, {"question_id": 37454, "answer": "in vogue"}, {"question_id": 37457, "answer": "photos"}, {"question_id": 37460, "answer": "air nippon"}, {"question_id": 37463, "answer": "can help me understand it"}, {"question_id": 37466, "answer": "crazy 8s"}, {"question_id": 37469, "answer": "kenner"}, {"question_id": 37472, "answer": "store"}, {"question_id": 37475, "answer": "no"}, {"question_id": 37478, "answer": "10"}, {"question_id": 37481, "answer": "rockstar"}, {"question_id": 37484, "answer": "50"}, {"question_id": 37487, "answer": "yankees"}, {"question_id": 37490, "answer": "irv"}, {"question_id": 37493, "answer": "1980"}, {"question_id": 37496, "answer": "shoesmith"}, {"question_id": 37499, "answer": "yes you do"}, {"question_id": 37502, "answer": "god"}, {"question_id": 37505, "answer": "back up"}, {"question_id": 37508, "answer": "washington"}, {"question_id": 37511, "answer": "stop sign"}, {"question_id": 37514, "answer": "titan"}, {"question_id": 37517, "answer": "indians"}, {"question_id": 37520, "answer": "apothecary"}, {"question_id": 37523, "answer": "facebook"}, {"question_id": 37526, "answer": "andersen"}, {"question_id": 37529, "answer": "samuel pupier"}, {"question_id": 37532, "answer": "yes"}, {"question_id": 37535, "answer": "yes"}, {"question_id": 37538, "answer": "coca cola"}, {"question_id": 37541, "answer": "please drive carefully"}, {"question_id": 37544, "answer": "kitchen aid"}, {"question_id": 37547, "answer": "yes"}, {"question_id": 37550, "answer": "102"}, {"question_id": 37553, "answer": "ww"}, {"question_id": 37556, "answer": "22"}, {"question_id": 37559, "answer": "new york times"}, {"question_id": 37562, "answer": "u"}, {"question_id": 37565, "answer": "inimigo"}, {"question_id": 37568, "answer": "2004"}, {"question_id": 37571, "answer": "first"}, {"question_id": 37574, "answer": "2nd street"}, {"question_id": 37577, "answer": "mary williams"}, {"question_id": 37580, "answer": "10:00"}, {"question_id": 37583, "answer": "spain"}, {"question_id": 37586, "answer": "store"}, {"question_id": 37589, "answer": "orange"}, {"question_id": 37592, "answer": "flyem"}, {"question_id": 37595, "answer": "army"}, {"question_id": 37598, "answer": "london"}, {"question_id": 37601, "answer": "les soci\u00e9t\u00e9s"}, {"question_id": 37604, "answer": "amazon"}, {"question_id": 37607, "answer": "ford"}, {"question_id": 37610, "answer": "mall dads"}, {"question_id": 37613, "answer": "heartbreaker"}, {"question_id": 37616, "answer": "2"}, {"question_id": 37619, "answer": "1999"}, {"question_id": 37622, "answer": "1 cent"}, {"question_id": 37625, "answer": "yes"}, {"question_id": 37628, "answer": "mira schendel"}, {"question_id": 37631, "answer": "9"}, {"question_id": 37634, "answer": "water"}, {"question_id": 37637, "answer": "ultramarino"}, {"question_id": 37640, "answer": "1977"}, {"question_id": 37643, "answer": "software company"}, {"question_id": 37646, "answer": "premier 1000"}, {"question_id": 37649, "answer": "g"}, {"question_id": 37652, "answer": "title of book"}, {"question_id": 37655, "answer": "harry potter and goblet of fire"}, {"question_id": 37658, "answer": "pedestrians"}, {"question_id": 37661, "answer": "11 14"}, {"question_id": 37664, "answer": "english"}, {"question_id": 37667, "answer": "18"}, {"question_id": 37670, "answer": "3"}, {"question_id": 37673, "answer": "sea"}, {"question_id": 37676, "answer": "en"}, {"question_id": 37679, "answer": "beer"}, {"question_id": 37682, "answer": "denmark"}, {"question_id": 37685, "answer": "cta"}, {"question_id": 37688, "answer": "running"}, {"question_id": 37691, "answer": "yes"}, {"question_id": 37694, "answer": "barilla"}, {"question_id": 37697, "answer": "adidas"}, {"question_id": 37700, "answer": "thirst"}, {"question_id": 37703, "answer": "csu"}, {"question_id": 37706, "answer": "herriot"}, {"question_id": 37709, "answer": "swiss"}, {"question_id": 37712, "answer": "god"}, {"question_id": 37715, "answer": "apple"}, {"question_id": 37718, "answer": "3 m"}, {"question_id": 37721, "answer": "canada"}, {"question_id": 37724, "answer": "mail"}, {"question_id": 37727, "answer": "panera"}, {"question_id": 37730, "answer": "yes"}, {"question_id": 37733, "answer": "red"}, {"question_id": 37736, "answer": "11"}, {"question_id": 37739, "answer": "yes"}, {"question_id": 37742, "answer": "orange"}, {"question_id": 37745, "answer": "stand up comedy club"}, {"question_id": 37748, "answer": "acer"}, {"question_id": 37751, "answer": "$10"}, {"question_id": 37754, "answer": "rawlins"}, {"question_id": 37757, "answer": "troublesome valley rd"}, {"question_id": 37760, "answer": "50"}, {"question_id": 37763, "answer": "108"}, {"question_id": 37766, "answer": "wine"}, {"question_id": 37769, "answer": "yes"}, {"question_id": 37772, "answer": "grow and build"}, {"question_id": 37775, "answer": "instant"}, {"question_id": 37778, "answer": "d h louis"}, {"question_id": 37781, "answer": "louvre"}, {"question_id": 37784, "answer": "10"}, {"question_id": 37787, "answer": "map"}, {"question_id": 37790, "answer": "tiger"}, {"question_id": 37793, "answer": "wesjern"}, {"question_id": 37796, "answer": "von"}, {"question_id": 37799, "answer": "no"}, {"question_id": 37802, "answer": "ikko"}, {"question_id": 37805, "answer": "yes"}, {"question_id": 37808, "answer": "african american fiction best african american fiction"}, {"question_id": 37811, "answer": "yes"}, {"question_id": 37814, "answer": "lynx"}, {"question_id": 37817, "answer": "12"}, {"question_id": 37820, "answer": "india pale ale"}, {"question_id": 37823, "answer": "ron"}, {"question_id": 37826, "answer": "race"}, {"question_id": 37829, "answer": "10"}, {"question_id": 37832, "answer": "laptop"}, {"question_id": 37835, "answer": "10"}, {"question_id": 37838, "answer": "samsung"}, {"question_id": 37841, "answer": "bertram"}, {"question_id": 37844, "answer": "john carrnel"}, {"question_id": 37847, "answer": "4"}, {"question_id": 37850, "answer": "7 nights of evil"}, {"question_id": 37853, "answer": "dell"}, {"question_id": 37856, "answer": "660909"}, {"question_id": 37859, "answer": "1990"}, {"question_id": 37862, "answer": "give book chance"}, {"question_id": 37865, "answer": "bear"}, {"question_id": 37868, "answer": "tunlop"}, {"question_id": 37871, "answer": "rolex"}, {"question_id": 37874, "answer": "holmes point"}, {"question_id": 37877, "answer": "1999"}, {"question_id": 37880, "answer": "10 15 14"}, {"question_id": 37883, "answer": "10"}, {"question_id": 37886, "answer": "tech"}, {"question_id": 37889, "answer": "glenfield"}, {"question_id": 37892, "answer": "yes"}, {"question_id": 37895, "answer": "707th fd"}, {"question_id": 37898, "answer": "7"}, {"question_id": 37901, "answer": "m6 tal"}, {"question_id": 37904, "answer": "canon"}, {"question_id": 37907, "answer": "welcome to jersey"}, {"question_id": 37910, "answer": "samsung"}, {"question_id": 37913, "answer": "arizona"}, {"question_id": 37916, "answer": "beer"}, {"question_id": 37919, "answer": "dog house"}, {"question_id": 37922, "answer": "europe"}, {"question_id": 37925, "answer": "voter pamphlet"}, {"question_id": 37928, "answer": "business design"}, {"question_id": 37931, "answer": "hero"}, {"question_id": 37934, "answer": "application"}, {"question_id": 37937, "answer": "food"}, {"question_id": 37940, "answer": "1999"}, {"question_id": 37943, "answer": "100"}, {"question_id": 37946, "answer": "laguna"}, {"question_id": 37949, "answer": "charles grayson"}, {"question_id": 37952, "answer": "1000"}, {"question_id": 37955, "answer": "b c d e f g h i j k l m n o p q r s t u v w x y z"}, {"question_id": 37958, "answer": "no butts on beach"}, {"question_id": 37961, "answer": "16"}, {"question_id": 37964, "answer": "cooler ferry"}, {"question_id": 37967, "answer": "trash"}, {"question_id": 37970, "answer": "sr"}, {"question_id": 37973, "answer": "10"}, {"question_id": 37976, "answer": "vodka"}, {"question_id": 37979, "answer": "dragonfly effect: quick effective and powerful ways to drive social change"}, {"question_id": 37982, "answer": "black"}, {"question_id": 37985, "answer": "adidas"}, {"question_id": 37988, "answer": "brand name"}, {"question_id": 37991, "answer": "barnes"}, {"question_id": 37994, "answer": "7"}, {"question_id": 37997, "answer": "no"}, {"question_id": 38000, "answer": "wired"}, {"question_id": 38003, "answer": "traffic"}, {"question_id": 38006, "answer": "albany"}, {"question_id": 38009, "answer": "cathay"}, {"question_id": 38012, "answer": "1000"}, {"question_id": 38015, "answer": "word"}, {"question_id": 38018, "answer": "hamlet"}, {"question_id": 38021, "answer": "mary melgermans"}, {"question_id": 38024, "answer": "o"}, {"question_id": 38027, "answer": "demers"}, {"question_id": 38030, "answer": "118"}, {"question_id": 38033, "answer": "aa"}, {"question_id": 38036, "answer": "1999"}, {"question_id": 38039, "answer": "$1.99"}, {"question_id": 38042, "answer": "pie"}, {"question_id": 38045, "answer": "10:30"}, {"question_id": 38048, "answer": "10 15"}, {"question_id": 38051, "answer": "tonight"}, {"question_id": 38054, "answer": "no"}, {"question_id": 38057, "answer": "n"}, {"question_id": 38060, "answer": "rubbermaid"}, {"question_id": 38063, "answer": "navy"}, {"question_id": 38066, "answer": "tachymeter"}, {"question_id": 38069, "answer": "no"}, {"question_id": 38072, "answer": "eli marcotte"}, {"question_id": 38075, "answer": "300"}, {"question_id": 38078, "answer": "yosemite"}, {"question_id": 38081, "answer": "google"}, {"question_id": 38084, "answer": "ruby"}, {"question_id": 38087, "answer": "yes"}, {"question_id": 38090, "answer": "aliki recreation center"}, {"question_id": 38093, "answer": "123"}, {"question_id": 38096, "answer": "12"}, {"question_id": 38099, "answer": "49"}, {"question_id": 38102, "answer": "100"}, {"question_id": 38105, "answer": "tiger"}, {"question_id": 38108, "answer": "1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 38111, "answer": "10:10"}, {"question_id": 38114, "answer": "lagunitas"}, {"question_id": 38117, "answer": "1895"}, {"question_id": 38120, "answer": "reference"}, {"question_id": 38123, "answer": "command module"}, {"question_id": 38126, "answer": "apple"}, {"question_id": 38129, "answer": "samsung"}, {"question_id": 38132, "answer": "mercedes"}, {"question_id": 38135, "answer": "acer"}, {"question_id": 38138, "answer": "hudson"}, {"question_id": 38141, "answer": "no"}, {"question_id": 38144, "answer": "htc"}, {"question_id": 38147, "answer": "102"}, {"question_id": 38150, "answer": "5:22"}, {"question_id": 38153, "answer": "floral"}, {"question_id": 38156, "answer": "canada"}, {"question_id": 38159, "answer": "joe pesci"}, {"question_id": 38162, "answer": "shell"}, {"question_id": 38165, "answer": "hb"}, {"question_id": 38168, "answer": "lego"}, {"question_id": 38171, "answer": "15070"}, {"question_id": 38174, "answer": "0"}, {"question_id": 38177, "answer": "1"}, {"question_id": 38180, "answer": "fast"}, {"question_id": 38183, "answer": "sony"}, {"question_id": 38186, "answer": "yes"}, {"question_id": 38189, "answer": "wednesday"}, {"question_id": 38192, "answer": "lady gaga"}, {"question_id": 38195, "answer": "78"}, {"question_id": 38198, "answer": "huntington"}, {"question_id": 38201, "answer": "nissan"}, {"question_id": 38204, "answer": "fort collins"}, {"question_id": 38207, "answer": "pp"}, {"question_id": 38210, "answer": "fan paw patrol"}, {"question_id": 38213, "answer": "building"}, {"question_id": 38216, "answer": "2005"}, {"question_id": 38219, "answer": "mercedes"}, {"question_id": 38222, "answer": "no"}, {"question_id": 38225, "answer": "nestle"}, {"question_id": 38228, "answer": "42"}, {"question_id": 38231, "answer": "3"}, {"question_id": 38234, "answer": "ye old editor has said"}, {"question_id": 38237, "answer": "mc"}, {"question_id": 38240, "answer": "road"}, {"question_id": 38243, "answer": "10:10"}, {"question_id": 38246, "answer": "awesome"}, {"question_id": 38249, "answer": "tipiret"}, {"question_id": 38252, "answer": "house on floss"}, {"question_id": 38255, "answer": "11:52"}, {"question_id": 38258, "answer": "2012"}, {"question_id": 38261, "answer": "el regalo de los reyes magos spanish"}, {"question_id": 38264, "answer": "px"}, {"question_id": 38267, "answer": "abc"}, {"question_id": 38270, "answer": "trash"}, {"question_id": 38273, "answer": "yes"}, {"question_id": 38276, "answer": "afterlife and other stories by john updike"}, {"question_id": 38279, "answer": "spring"}, {"question_id": 38282, "answer": "wine"}, {"question_id": 38285, "answer": "terra ferma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafirma terrafir"}, {"question_id": 38288, "answer": "italian"}, {"question_id": 38291, "answer": "10"}, {"question_id": 38294, "answer": "xbox"}, {"question_id": 38297, "answer": "milk"}, {"question_id": 38300, "answer": "inkjet"}, {"question_id": 38303, "answer": "greenbaugh"}, {"question_id": 38306, "answer": "hellosquid"}, {"question_id": 38309, "answer": "yes"}, {"question_id": 38312, "answer": "trapiche"}, {"question_id": 38315, "answer": "ex"}, {"question_id": 38318, "answer": "brooklyn"}, {"question_id": 38321, "answer": "gold 's"}, {"question_id": 38324, "answer": "2 hours"}, {"question_id": 38327, "answer": "2013"}, {"question_id": 38330, "answer": "9"}, {"question_id": 38333, "answer": "yes"}, {"question_id": 38336, "answer": "store"}, {"question_id": 38339, "answer": "stella"}, {"question_id": 38342, "answer": "chacabuco"}, {"question_id": 38345, "answer": "11:30"}, {"question_id": 38348, "answer": "blackberry"}, {"question_id": 38351, "answer": ""}, {"question_id": 38354, "answer": "1970"}, {"question_id": 38357, "answer": "yes"}, {"question_id": 38360, "answer": "bell"}, {"question_id": 38363, "answer": "confusion"}, {"question_id": 38366, "answer": "chinese"}, {"question_id": 38369, "answer": ""}, {"question_id": 38372, "answer": "gilmore"}, {"question_id": 38375, "answer": "athletics"}, {"question_id": 38378, "answer": "salad"}, {"question_id": 38381, "answer": "3"}, {"question_id": 38384, "answer": "lg"}, {"question_id": 38387, "answer": "delta"}, {"question_id": 38390, "answer": "schubert beethoven"}, {"question_id": 38393, "answer": "theo"}, {"question_id": 38396, "answer": "19"}, {"question_id": 38399, "answer": "umbus"}, {"question_id": 38402, "answer": "dubbel"}, {"question_id": 38405, "answer": "changers"}, {"question_id": 38408, "answer": "sweet"}, {"question_id": 38411, "answer": "babar"}, {"question_id": 38414, "answer": "yes"}, {"question_id": 38417, "answer": "augustus 's"}, {"question_id": 38420, "answer": "101"}, {"question_id": 38423, "answer": "turtles"}, {"question_id": 38426, "answer": "brown"}, {"question_id": 38429, "answer": "christie"}, {"question_id": 38432, "answer": "8:22"}, {"question_id": 38435, "answer": "cadence"}, {"question_id": 38438, "answer": "1905"}, {"question_id": 38441, "answer": "football"}, {"question_id": 38444, "answer": "baseball"}, {"question_id": 38447, "answer": "take seat"}, {"question_id": 38450, "answer": "airline"}, {"question_id": 38453, "answer": "candy"}, {"question_id": 38456, "answer": "star club"}, {"question_id": 38459, "answer": "leuven"}, {"question_id": 38462, "answer": "mac"}, {"question_id": 38465, "answer": "ll"}, {"question_id": 38468, "answer": "epistot"}, {"question_id": 38471, "answer": "crew"}, {"question_id": 38474, "answer": "sony"}, {"question_id": 38477, "answer": "10 20 13"}, {"question_id": 38480, "answer": "10:10"}, {"question_id": 38483, "answer": "recycling"}, {"question_id": 38486, "answer": "adidas"}, {"question_id": 38489, "answer": "19"}, {"question_id": 38492, "answer": "100"}, {"question_id": 38495, "answer": "hamburg"}, {"question_id": 38498, "answer": "strawberries"}, {"question_id": 38501, "answer": "route rest"}, {"question_id": 38504, "answer": "software"}, {"question_id": 38507, "answer": "24"}, {"question_id": 38510, "answer": "i promise to love you"}, {"question_id": 38513, "answer": "3"}, {"question_id": 38516, "answer": "2014"}, {"question_id": 38519, "answer": ""}, {"question_id": 38522, "answer": "100"}, {"question_id": 38525, "answer": "orion"}, {"question_id": 38528, "answer": "investment"}, {"question_id": 38531, "answer": "us army"}, {"question_id": 38534, "answer": "konami"}, {"question_id": 38537, "answer": "107"}, {"question_id": 38540, "answer": "bang"}, {"question_id": 38543, "answer": "army man"}, {"question_id": 38546, "answer": "renan camus"}, {"question_id": 38549, "answer": "yes"}, {"question_id": 38552, "answer": "atlanta"}, {"question_id": 38555, "answer": "promo"}, {"question_id": 38558, "answer": "own line"}, {"question_id": 38561, "answer": "1991"}, {"question_id": 38564, "answer": "girardi"}, {"question_id": 38567, "answer": "nintendo"}, {"question_id": 38570, "answer": "cell phone"}, {"question_id": 38573, "answer": "book"}, {"question_id": 38576, "answer": "samsung"}, {"question_id": 38579, "answer": "12"}, {"question_id": 38582, "answer": "seiko"}, {"question_id": 38585, "answer": "white"}, {"question_id": 38588, "answer": "pepsi"}, {"question_id": 38591, "answer": "12345"}, {"question_id": 38594, "answer": "wwwkoreancom"}, {"question_id": 38597, "answer": "red"}, {"question_id": 38600, "answer": "board"}, {"question_id": 38603, "answer": "1909"}, {"question_id": 38606, "answer": "10:10"}, {"question_id": 38609, "answer": "soda"}, {"question_id": 38612, "answer": "10"}, {"question_id": 38615, "answer": "nexus"}, {"question_id": 38618, "answer": "godiva"}, {"question_id": 38621, "answer": "wwweurasian nationorg"}, {"question_id": 38624, "answer": "2005"}, {"question_id": 38627, "answer": "2013"}, {"question_id": 38630, "answer": "yellow"}, {"question_id": 38633, "answer": "louis roederer"}, {"question_id": 38636, "answer": "rye"}, {"question_id": 38639, "answer": "apple"}, {"question_id": 38642, "answer": "daily camera"}, {"question_id": 38645, "answer": "john pizzicato"}, {"question_id": 38648, "answer": "7"}, {"question_id": 38651, "answer": "oktoberfest"}, {"question_id": 38654, "answer": "ford"}, {"question_id": 38657, "answer": "unlike"}, {"question_id": 38660, "answer": "elmira"}, {"question_id": 38663, "answer": "2000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"}, {"question_id": 38666, "answer": "store"}, {"question_id": 38669, "answer": "education"}, {"question_id": 38672, "answer": "viking court"}, {"question_id": 38675, "answer": "100"}, {"question_id": 38678, "answer": "wildfire"}, {"question_id": 38681, "answer": "yes"}, {"question_id": 38684, "answer": "25"}, {"question_id": 38687, "answer": "3"}, {"question_id": 38690, "answer": "road to riches"}, {"question_id": 38693, "answer": "tripel"}, {"question_id": 38696, "answer": "saturday"}, {"question_id": 38699, "answer": "hokkaido"}, {"question_id": 38702, "answer": "yes"}, {"question_id": 38705, "answer": "brown"}, {"question_id": 38708, "answer": "rolex"}, {"question_id": 38711, "answer": "100"}, {"question_id": 38714, "answer": "potatoes"}, {"question_id": 38717, "answer": "guadalajara"}, {"question_id": 38720, "answer": "coca cola"}, {"question_id": 38723, "answer": "harris"}, {"question_id": 38726, "answer": "zimere"}, {"question_id": 38729, "answer": "fire"}, {"question_id": 38732, "answer": "poddymore"}, {"question_id": 38735, "answer": "tango"}, {"question_id": 38738, "answer": "pomerol"}, {"question_id": 38741, "answer": "hermann"}, {"question_id": 38744, "answer": "murphy 's"}, {"question_id": 38747, "answer": "10:10"}, {"question_id": 38750, "answer": "pandora"}, {"question_id": 38753, "answer": "10:10"}, {"question_id": 38756, "answer": "black and white"}, {"question_id": 38759, "answer": "syrtaki"}, {"question_id": 38762, "answer": "alcohol"}, {"question_id": 38765, "answer": "coca cola"}, {"question_id": 38768, "answer": "great hall ceiling model"}, {"question_id": 38771, "answer": "google"}, {"question_id": 38774, "answer": "j k rowling"}, {"question_id": 38777, "answer": "123456"}, {"question_id": 38780, "answer": "001"}, {"question_id": 38783, "answer": "wine"}, {"question_id": 38786, "answer": "dhl"}, {"question_id": 38789, "answer": "100"}, {"question_id": 38792, "answer": "digtal"}, {"question_id": 38795, "answer": "8:30"}, {"question_id": 38798, "answer": "chocolates"}, {"question_id": 38801, "answer": "rose"}, {"question_id": 38804, "answer": "on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it says on back of coin it"}, {"question_id": 38807, "answer": "2018"}, {"question_id": 38810, "answer": "norfolkinecom"}, {"question_id": 38813, "answer": "whipsibibi"}, {"question_id": 38816, "answer": "union"}, {"question_id": 38819, "answer": "anchor"}, {"question_id": 38822, "answer": "nintendo"}, {"question_id": 38825, "answer": "scotch ale"}, {"question_id": 38828, "answer": "compaq"}, {"question_id": 38831, "answer": "alfred d pfeiffer"}, {"question_id": 38834, "answer": "dance with dragons"}, {"question_id": 38837, "answer": "yes"}, {"question_id": 38840, "answer": "white"}, {"question_id": 38843, "answer": "lager"}, {"question_id": 38846, "answer": "magazine"}, {"question_id": 38849, "answer": "100"}, {"question_id": 38852, "answer": "32"}, {"question_id": 38855, "answer": "book"}, {"question_id": 38858, "answer": "premier aire"}, {"question_id": 38861, "answer": "beer"}, {"question_id": 38864, "answer": "london"}, {"question_id": 38867, "answer": "fairy tales"}, {"question_id": 38870, "answer": "cbc"}, {"question_id": 38873, "answer": "5 miles"}, {"question_id": 38876, "answer": "12 oz"}, {"question_id": 38879, "answer": "delta"}, {"question_id": 38882, "answer": "yes"}, {"question_id": 38885, "answer": "quebec"}, {"question_id": 38888, "answer": "12"}, {"question_id": 38891, "answer": "1332"}, {"question_id": 38894, "answer": "hawaii"}, {"question_id": 38897, "answer": "no"}, {"question_id": 38900, "answer": "1990"}, {"question_id": 38903, "answer": "1000"}, {"question_id": 38906, "answer": "periodic table of heavy metals"}, {"question_id": 38909, "answer": "fun"}, {"question_id": 38912, "answer": "bookstore"}, {"question_id": 38915, "answer": "trash"}, {"question_id": 38918, "answer": "5"}, {"question_id": 38921, "answer": "vogue"}, {"question_id": 38924, "answer": "10"}, {"question_id": 38927, "answer": "peace"}, {"question_id": 38930, "answer": "national city"}, {"question_id": 38933, "answer": "dell"}, {"question_id": 38936, "answer": "poppy"}, {"question_id": 38939, "answer": "1965 1973"}, {"question_id": 38942, "answer": "5"}, {"question_id": 38945, "answer": "81"}, {"question_id": 38948, "answer": "darth vader"}, {"question_id": 38951, "answer": "indians"}, {"question_id": 38954, "answer": "german"}, {"question_id": 38957, "answer": "pepsi"}, {"question_id": 38960, "answer": "no"}, {"question_id": 38963, "answer": "samsung"}, {"question_id": 38966, "answer": "dinosaur"}, {"question_id": 38969, "answer": "music"}, {"question_id": 38972, "answer": "concise"}, {"question_id": 38975, "answer": "44"}, {"question_id": 38978, "answer": "no"}, {"question_id": 38981, "answer": "army"}, {"question_id": 38984, "answer": "10"}, {"question_id": 38987, "answer": "western"}, {"question_id": 38990, "answer": "no"}, {"question_id": 38993, "answer": "bessie"}, {"question_id": 38996, "answer": "sign"}, {"question_id": 38999, "answer": "12 23 2012"}, {"question_id": 39002, "answer": "10:00"}, {"question_id": 39005, "answer": "bacteria"}, {"question_id": 39008, "answer": "21"}, {"question_id": 39011, "answer": "99"}, {"question_id": 39014, "answer": "firenze"}, {"question_id": 39017, "answer": "503 222 2222"}, {"question_id": 39020, "answer": "in god i trust"}, {"question_id": 39023, "answer": "stop"}, {"question_id": 39026, "answer": "moma"}, {"question_id": 39029, "answer": "old times"}, {"question_id": 39032, "answer": "lego"}, {"question_id": 39035, "answer": "e"}, {"question_id": 39038, "answer": "metka"}, {"question_id": 39041, "answer": "big omaha 2009"}, {"question_id": 39044, "answer": "yes"}, {"question_id": 39047, "answer": "toronto"}, {"question_id": 39050, "answer": "lost"}, {"question_id": 39053, "answer": "yes"}, {"question_id": 39056, "answer": "10:00"}, {"question_id": 39059, "answer": "caboose"}, {"question_id": 39062, "answer": "cu"}, {"question_id": 39065, "answer": "carl davis"}, {"question_id": 39068, "answer": "queen"}, {"question_id": 39071, "answer": "boston"}, {"question_id": 39074, "answer": "hand"}, {"question_id": 39077, "answer": "no"}, {"question_id": 39080, "answer": "raf"}, {"question_id": 39083, "answer": "wii"}, {"question_id": 39086, "answer": "10:10"}, {"question_id": 39089, "answer": "100"}, {"question_id": 39092, "answer": "british"}, {"question_id": 39095, "answer": "space bar"}, {"question_id": 39098, "answer": "google"}, {"question_id": 39101, "answer": "police"}, {"question_id": 39104, "answer": "roxy gould"}, {"question_id": 39107, "answer": "p"}, {"question_id": 39110, "answer": "pizza"}, {"question_id": 39113, "answer": "55"}, {"question_id": 39116, "answer": "alcatraz"}, {"question_id": 39119, "answer": "supreme court"}, {"question_id": 39122, "answer": "nintendo"}, {"question_id": 39125, "answer": "2015"}, {"question_id": 39128, "answer": "cnn"}, {"question_id": 39131, "answer": "10 20 09"}, {"question_id": 39134, "answer": "yes"}, {"question_id": 39137, "answer": "nigeria"}, {"question_id": 39140, "answer": "caberlereiro"}, {"question_id": 39143, "answer": "panasonic"}, {"question_id": 39146, "answer": "2010"}, {"question_id": 39149, "answer": "21"}, {"question_id": 39152, "answer": "ireland"}, {"question_id": 39155, "answer": "large range"}, {"question_id": 39158, "answer": "lion gate"}, {"question_id": 39161, "answer": "put to paper"}, {"question_id": 39164, "answer": "sony"}, {"question_id": 39167, "answer": "japan"}, {"question_id": 39170, "answer": "rolex"}, {"question_id": 39173, "answer": "no"}, {"question_id": 39176, "answer": "yes"}, {"question_id": 39179, "answer": "1971"}, {"question_id": 39182, "answer": "christmas"}, {"question_id": 39185, "answer": "yes"}, {"question_id": 39188, "answer": "30"}, {"question_id": 39191, "answer": "visa"}, {"question_id": 39194, "answer": "315"}, {"question_id": 39197, "answer": "android"}, {"question_id": 39200, "answer": "tell it to marines"}, {"question_id": 39203, "answer": "lg"}, {"question_id": 39206, "answer": "los angeles"}, {"question_id": 39209, "answer": "new orleans"}, {"question_id": 39212, "answer": "bud"}, {"question_id": 39215, "answer": "tea"}, {"question_id": 39218, "answer": "jet smart xtl"}, {"question_id": 39221, "answer": "55"}, {"question_id": 39224, "answer": "w vintner"}, {"question_id": 39227, "answer": "hotel"}, {"question_id": 39230, "answer": "baby"}, {"question_id": 39233, "answer": "yes"}, {"question_id": 39236, "answer": "street"}, {"question_id": 39239, "answer": "tellus"}, {"question_id": 39242, "answer": "nikon"}, {"question_id": 39245, "answer": "swiss"}, {"question_id": 39248, "answer": "iowa"}, {"question_id": 39251, "answer": "fresh"}, {"question_id": 39254, "answer": "oscars"}, {"question_id": 39257, "answer": "ormacheck"}, {"question_id": 39260, "answer": "01 01 2010"}, {"question_id": 39263, "answer": "10"}, {"question_id": 39266, "answer": "delirium"}, {"question_id": 39269, "answer": "165"}, {"question_id": 39272, "answer": "rutgers"}, {"question_id": 39275, "answer": "loka"}, {"question_id": 39278, "answer": "t"}, {"question_id": 39281, "answer": "90"}, {"question_id": 39284, "answer": "saturday"}, {"question_id": 39287, "answer": "california"}, {"question_id": 39290, "answer": "hewlett"}, {"question_id": 39293, "answer": "woodnut"}, {"question_id": 39296, "answer": "yes"}, {"question_id": 39299, "answer": "heavy"}, {"question_id": 39302, "answer": "washington"}, {"question_id": 39305, "answer": "canada"}, {"question_id": 39308, "answer": "idorg"}, {"question_id": 39311, "answer": "army"}, {"question_id": 39314, "answer": "peggy goodman"}, {"question_id": 39317, "answer": "red burgundy book"}, {"question_id": 39320, "answer": "exit"}, {"question_id": 39323, "answer": "sky mall"}, {"question_id": 39326, "answer": "klein"}, {"question_id": 39329, "answer": "60 seconds"}, {"question_id": 39332, "answer": "fast"}, {"question_id": 39335, "answer": "skrin"}, {"question_id": 39338, "answer": "11"}, {"question_id": 39341, "answer": "ford"}, {"question_id": 39344, "answer": "backboard"}, {"question_id": 39347, "answer": "red"}, {"question_id": 39350, "answer": "highways and center of william az"}, {"question_id": 39353, "answer": "im"}, {"question_id": 39356, "answer": "10:10"}, {"question_id": 39359, "answer": "nescafe"}, {"question_id": 39362, "answer": "dunkin'"}, {"question_id": 39365, "answer": "1"}, {"question_id": 39368, "answer": "stop"}, {"question_id": 39371, "answer": "valuable"}, {"question_id": 39374, "answer": "kirby"}, {"question_id": 39377, "answer": "50"}, {"question_id": 39380, "answer": "toyota"}, {"question_id": 39383, "answer": "emirates"}, {"question_id": 39386, "answer": "2"}, {"question_id": 39389, "answer": "rivoli"}, {"question_id": 39392, "answer": "yahoo"}, {"question_id": 39395, "answer": "chicago riverdogs"}, {"question_id": 39398, "answer": "ctrl"}, {"question_id": 39401, "answer": "microsoft"}, {"question_id": 39404, "answer": "harry"}, {"question_id": 39407, "answer": "congway"}, {"question_id": 39410, "answer": "whole"}, {"question_id": 39413, "answer": "ice"}, {"question_id": 39416, "answer": "love"}, {"question_id": 39419, "answer": "babe"}, {"question_id": 39422, "answer": "8"}, {"question_id": 39425, "answer": "2"}, {"question_id": 39428, "answer": "scissors"}, {"question_id": 39431, "answer": "fort worth beach"}, {"question_id": 39434, "answer": "yes"}, {"question_id": 39437, "answer": "taco truck"}, {"question_id": 39440, "answer": "30"}, {"question_id": 39443, "answer": "rd"}, {"question_id": 39446, "answer": "white"}, {"question_id": 39449, "answer": "harm"}, {"question_id": 39452, "answer": "put down man"}, {"question_id": 39455, "answer": "evostik"}, {"question_id": 39458, "answer": "at&t"}, {"question_id": 39461, "answer": "song of solomon"}, {"question_id": 39464, "answer": "yes"}, {"question_id": 39467, "answer": "sun"}, {"question_id": 39470, "answer": "10"}, {"question_id": 39473, "answer": "caslon"}, {"question_id": 39476, "answer": "poland"}, {"question_id": 39479, "answer": "horner"}, {"question_id": 39482, "answer": "highland"}, {"question_id": 39485, "answer": "10:10"}, {"question_id": 39488, "answer": "$12.99"}, {"question_id": 39491, "answer": "advertisement"}, {"question_id": 39494, "answer": "yellow"}, {"question_id": 39497, "answer": "dancer 's arrow chronicles of tormor"}, {"question_id": 39500, "answer": "$1.50"}, {"question_id": 39503, "answer": "glenn"}, {"question_id": 39506, "answer": "yes"}, {"question_id": 39509, "answer": "energy glut: politics of fatness in overconsumpting world"}, {"question_id": 39512, "answer": "10"}, {"question_id": 39515, "answer": "colorado"}, {"question_id": 39518, "answer": "yes"}, {"question_id": 39521, "answer": ""}, {"question_id": 39524, "answer": "cabernet sauvignon"}, {"question_id": 39527, "answer": "wwwkickoffgameovercom"}, {"question_id": 39530, "answer": "gabriel carmen pascuata"}, {"question_id": 39533, "answer": "shh"}, {"question_id": 39536, "answer": "tv"}, {"question_id": 39539, "answer": "4"}, {"question_id": 39542, "answer": "beer"}, {"question_id": 39545, "answer": "firefox"}, {"question_id": 39548, "answer": "tape measure"}, {"question_id": 39551, "answer": "2011"}, {"question_id": 39554, "answer": "16"}, {"question_id": 39557, "answer": "16 cups"}, {"question_id": 39560, "answer": "bachelorette"}, {"question_id": 39563, "answer": "1000"}, {"question_id": 39566, "answer": "stop"}, {"question_id": 39569, "answer": "9000"}, {"question_id": 39572, "answer": "yes"}, {"question_id": 39575, "answer": "diet"}, {"question_id": 39578, "answer": "sonora"}, {"question_id": 39581, "answer": "perris"}, {"question_id": 39584, "answer": "sale"}, {"question_id": 39587, "answer": "sierra nevada"}, {"question_id": 39590, "answer": "tree"}, {"question_id": 39593, "answer": "15"}, {"question_id": 39596, "answer": "100"}, {"question_id": 39599, "answer": "14"}] \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/textvqa_val.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/textvqa_val.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f5d69f8e402c81ea79a9eecf59db378254dd80 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/logs/0512_0233_llava...a_val_llava_model_args_f58958/textvqa_val.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c975f9d9ba823b5756eecbd96848cd7fcf27325bbc7a8b6bfe1c574963ad989a +size 13251637 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc88e0c4ac2cd86a9e247b43be1d8ba345042304 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fef2b243074dff406fcb0b85f2fc0f2d944c3aa2b6648a35328c0847e42c1c10 +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74aaffdc337c5a168a279aed341c53617abfb292 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7428511a0f39116505eb0e78fefd1d50fe2ddacee4482cdd5d925938d450347 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_competesmoev30/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aee0a09f7a0c5c07cc903085a419e318ae426cf5 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/trainer_state.json @@ -0,0 +1,17113 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2343431372144640.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/training_args.bin b/sft_pretrain/Full_competesmoev30/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_competesmoev30/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/config.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28a5bb1c149304f33214eee3c6e2764711ffb065 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/generation_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e72130c613f7fb3fd057920e05b079504db106a9 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1adb17c65648ba15576cc02ec5aa98399861b23912419d23d615fb5d754f6790 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86abd020c9a8c0f68ec5001d907598b74461b5fd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c551e6156041124627bb32d36fdefc5fe77958252bf59f3bdafe6f283e24cd +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dee4173183d6d4734d3a0afc51094cbd04d1f9ac --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00c0dd6cdf805e70afa34ed1b95c88aedfa15a2243ab5a98c76b4263d476615 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1962641ef2e4d7214dd624bacb9717989199f81f --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:614dcf6a9a29c87fc6f09a3428fc6c8ce23f01266a9cb801ef163008524770d6 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1123bef8ea3bdc61a4e834e8fed3516f1838aae4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147aba3cc5c11fbcb61b7ef325287f19e3cd6ec48656f77853b2313fafda1a08 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86ae5a76db4f35b71724f369a5c3809e5ccdd375 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9cca58a42fdbd875d5a95c998090b4e37f22c6bd407642681366c3825e52f77 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8188ad2ebea22c9476492a60fb83396bd69f5bcc --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32bbe3f13281487d9a3d64f8b4335c2b3916a05fa8e348f135e82b28fbe0e6de +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cdb10b4bf97fc610e9622a08a56c36812450ae8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d57622ef765ff525a6369a945ff4c23c08b1490fc0d8b7273b979ab919b0f29 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/latest b/sft_pretrain/Full_competesmoev30/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1f59c84221f2ec3b9746db6c6a33c6c809d9a5c5 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc661e658543918d0fa6715f947dff0e9b2f4452dfcc976110b7177c818c04fa +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74aaffdc337c5a168a279aed341c53617abfb292 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7428511a0f39116505eb0e78fefd1d50fe2ddacee4482cdd5d925938d450347 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_competesmoev30/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..da6a642ee0244209f5c2d9595a3d571facc85f64 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/trainer_state.json @@ -0,0 +1,34793 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.10879421, + "diversity_loss_mlp": 0.0, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0911420547130344, + "language_loss": 0.8431657, + "learning_rate": 0.000925888133132719, + "loss": 0.85441184, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1041, + "time_per_iteration": 2.780141830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_mlp": 1.05260694, + "diversity_loss_mlp": 0.0, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.04139604987307943, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80673575, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 1042, + "time_per_iteration": 4.971017360687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100746, + "balance_loss_mlp": 1.08498645, + "diversity_loss_mlp": 0.0, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.08950731646766712, + "language_loss": 0.81070006, + "learning_rate": 0.0009255613649386244, + "loss": 0.82170749, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.1574707, + "routerloss_mlp": 0.0, + "step": 1043, + "time_per_iteration": 2.6508612632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091355, + "balance_loss_mlp": 1.07623935, + "diversity_loss_mlp": 0.0, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07614483401418765, + "language_loss": 0.78829026, + "learning_rate": 0.0009253977329834838, + "loss": 0.79920387, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1044, + "time_per_iteration": 2.7090582847595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109464, + "balance_loss_mlp": 1.07947624, + "diversity_loss_mlp": 0.0, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.0989854096864982, + "language_loss": 0.86366481, + "learning_rate": 0.0009252339358742965, + "loss": 0.8746112, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1045, + "time_per_iteration": 2.801323652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_mlp": 1.08526874, + "diversity_loss_mlp": 0.0, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07994799859902735, + "language_loss": 0.83704323, + "learning_rate": 0.000925069973674654, + "loss": 0.84804672, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1046, + "time_per_iteration": 2.6286635398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_mlp": 1.09036636, + "diversity_loss_mlp": 0.0, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.05803081938267982, + "language_loss": 0.88841283, + "learning_rate": 0.000924905846448212, + "loss": 0.89945889, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1047, + "time_per_iteration": 2.7208023071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.12078381, + "diversity_loss_mlp": 0.0, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09159511175118457, + "language_loss": 0.85692465, + "learning_rate": 0.0009247415542586906, + "loss": 0.86827493, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1048, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089504, + "balance_loss_mlp": 1.55797935, + "diversity_loss_mlp": 0.19993141, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.028193920194447036, + "language_loss": 0.83094788, + "learning_rate": 0.0009245770971698735, + "loss": 0.83989829, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01608507, + "step": 1049, + "time_per_iteration": 2.922792911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.12878203, + "diversity_loss_mlp": 0.0, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08345797467079887, + "language_loss": 0.88434327, + "learning_rate": 0.0009244124752456087, + "loss": 0.89577425, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1050, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141188, + "balance_loss_mlp": 1.12675214, + "diversity_loss_mlp": 0.0, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07479960387863874, + "language_loss": 0.85303241, + "learning_rate": 0.0009242476885498081, + "loss": 0.86444432, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1051, + "time_per_iteration": 2.8012773990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146474, + "balance_loss_mlp": 1.13181126, + "diversity_loss_mlp": 0.0, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.07632391919964465, + "language_loss": 0.81114984, + "learning_rate": 0.0009240827371464474, + "loss": 0.82261455, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1052, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146729, + "balance_loss_mlp": 1.1323998, + "diversity_loss_mlp": 0.0, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.11219768477147798, + "language_loss": 0.84167284, + "learning_rate": 0.0009239176210995666, + "loss": 0.85314012, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1053, + "time_per_iteration": 3.4905290603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153158, + "balance_loss_mlp": 1.13878179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.07345468089138417, + "language_loss": 0.93850195, + "learning_rate": 0.0009237523404732695, + "loss": 0.95003355, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1054, + "time_per_iteration": 2.8854215145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116831, + "balance_loss_mlp": 1.15374279, + "diversity_loss_mlp": 0.0, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08788286689344726, + "language_loss": 0.84136868, + "learning_rate": 0.0009235868953317235, + "loss": 0.85305184, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1055, + "time_per_iteration": 2.785616397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115453, + "balance_loss_mlp": 1.14033246, + "diversity_loss_mlp": 0.0, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.07006303181868268, + "language_loss": 0.85314858, + "learning_rate": 0.0009234212857391602, + "loss": 0.86469388, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1056, + "time_per_iteration": 3.192293167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167757, + "balance_loss_mlp": 1.15304708, + "diversity_loss_mlp": 0.0, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.07469852363602907, + "language_loss": 0.89220309, + "learning_rate": 0.000923255511759875, + "loss": 0.9038806, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1057, + "time_per_iteration": 2.783778429031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881428, + "balance_loss_mlp": 1.53356147, + "diversity_loss_mlp": 0.1968638, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.032510948660132113, + "language_loss": 0.84587663, + "learning_rate": 0.000923089573458227, + "loss": 0.85469091, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621579, + "step": 1058, + "time_per_iteration": 2.8847100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_mlp": 1.13623881, + "diversity_loss_mlp": 0.0, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.11181454207252314, + "language_loss": 0.83516467, + "learning_rate": 0.0009229234708986392, + "loss": 0.84667218, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1059, + "time_per_iteration": 2.9079415798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172867, + "balance_loss_mlp": 1.16251993, + "diversity_loss_mlp": 0.0, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.06024273804144221, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82839763, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1060, + "time_per_iteration": 4.646218776702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112152, + "balance_loss_mlp": 1.10713172, + "diversity_loss_mlp": 0.0, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.08928557521337042, + "language_loss": 0.85345757, + "learning_rate": 0.0009225907732636548, + "loss": 0.86467278, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1061, + "time_per_iteration": 2.745448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_mlp": 1.09209883, + "diversity_loss_mlp": 0.0, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.079028173596336, + "language_loss": 0.86936563, + "learning_rate": 0.0009224241783174227, + "loss": 0.88042819, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1062, + "time_per_iteration": 2.6923935413360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_mlp": 1.07616472, + "diversity_loss_mlp": 0.0, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.07452632641130948, + "language_loss": 0.85384166, + "learning_rate": 0.0009222574193715802, + "loss": 0.86474669, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1063, + "time_per_iteration": 2.7701327800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07850981, + "diversity_loss_mlp": 0.0, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06517233034985846, + "language_loss": 0.85915947, + "learning_rate": 0.000922090496490869, + "loss": 0.87008905, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1064, + "time_per_iteration": 2.7387099266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098934, + "balance_loss_mlp": 1.08404493, + "diversity_loss_mlp": 0.0, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.06963355430403552, + "language_loss": 0.89889115, + "learning_rate": 0.0009219234097400937, + "loss": 0.90988052, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1065, + "time_per_iteration": 2.859334707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112884, + "balance_loss_mlp": 1.09778059, + "diversity_loss_mlp": 0.0, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06723697540994414, + "language_loss": 0.83086514, + "learning_rate": 0.0009217561591841237, + "loss": 0.84199405, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1066, + "time_per_iteration": 3.3065547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00886484, + "balance_loss_mlp": 1.54046464, + "diversity_loss_mlp": 0.1982768, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.03984406199709606, + "language_loss": 0.80820358, + "learning_rate": 0.0009215887448878913, + "loss": 0.8170684, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01711285, + "step": 1067, + "time_per_iteration": 2.6291754245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131678, + "balance_loss_mlp": 1.11697936, + "diversity_loss_mlp": 0.0, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.07633348035576148, + "language_loss": 0.85365784, + "learning_rate": 0.0009214211669163922, + "loss": 0.86497462, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1068, + "time_per_iteration": 2.747936725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_mlp": 1.12220347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.07197705825645119, + "language_loss": 0.9405331, + "learning_rate": 0.0009212534253346862, + "loss": 0.95190227, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1069, + "time_per_iteration": 2.696131467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128507, + "balance_loss_mlp": 1.11372542, + "diversity_loss_mlp": 0.0, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09743186487320747, + "language_loss": 0.84269625, + "learning_rate": 0.0009210855202078964, + "loss": 0.85398132, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1070, + "time_per_iteration": 2.6194372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114316, + "balance_loss_mlp": 1.12903321, + "diversity_loss_mlp": 0.0, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08033414700046611, + "language_loss": 0.87081122, + "learning_rate": 0.0009209174516012091, + "loss": 0.88224292, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1071, + "time_per_iteration": 2.5169904232025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146914, + "balance_loss_mlp": 1.13247752, + "diversity_loss_mlp": 0.0, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.06769648970134874, + "language_loss": 0.89207751, + "learning_rate": 0.0009207492195798747, + "loss": 0.90354669, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1072, + "time_per_iteration": 2.804577112197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.12303698, + "diversity_loss_mlp": 0.0, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.0857236005827703, + "language_loss": 0.84780991, + "learning_rate": 0.0009205808242092061, + "loss": 0.85918474, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1073, + "time_per_iteration": 2.6134936809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122455, + "balance_loss_mlp": 1.10787559, + "diversity_loss_mlp": 0.0, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.09531084522047072, + "language_loss": 0.82512677, + "learning_rate": 0.0009204122655545808, + "loss": 0.83635134, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1074, + "time_per_iteration": 3.461315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888955, + "balance_loss_mlp": 1.54418314, + "diversity_loss_mlp": 0.20175909, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.03221822204199988, + "language_loss": 0.80952764, + "learning_rate": 0.0009202435436814388, + "loss": 0.81841719, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01598355, + "step": 1075, + "time_per_iteration": 2.728055238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146745, + "balance_loss_mlp": 1.13259482, + "diversity_loss_mlp": 0.0, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.0831097658087499, + "language_loss": 0.89925295, + "learning_rate": 0.0009200746586552836, + "loss": 0.91072041, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1076, + "time_per_iteration": 2.929422616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.12185347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.07960863169785164, + "language_loss": 0.84148425, + "learning_rate": 0.0009199056105416825, + "loss": 0.85284609, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1077, + "time_per_iteration": 3.0795576572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_mlp": 1.13384151, + "diversity_loss_mlp": 0.0, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.06589509494701294, + "language_loss": 0.86599898, + "learning_rate": 0.0009197363994062654, + "loss": 0.87747955, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1078, + "time_per_iteration": 2.8304550647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891417, + "balance_loss_mlp": 1.54815006, + "diversity_loss_mlp": 0.20151556, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.027729032115243194, + "language_loss": 0.84302026, + "learning_rate": 0.0009195670253147262, + "loss": 0.85193443, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01658459, + "step": 1079, + "time_per_iteration": 2.987715005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168872, + "balance_loss_mlp": 1.15472198, + "diversity_loss_mlp": 0.0, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07878432741989363, + "language_loss": 0.82508785, + "learning_rate": 0.0009193974883328216, + "loss": 0.83677661, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1080, + "time_per_iteration": 2.6007754802703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178335, + "balance_loss_mlp": 1.16408908, + "diversity_loss_mlp": 0.0, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06872318796781544, + "language_loss": 0.86871535, + "learning_rate": 0.0009192277885263718, + "loss": 0.88049871, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1081, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_mlp": 1.15339386, + "diversity_loss_mlp": 0.0, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.08475435362049728, + "language_loss": 0.86010319, + "learning_rate": 0.0009190579259612602, + "loss": 0.87178093, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1082, + "time_per_iteration": 3.2688331604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153529, + "balance_loss_mlp": 1.13914001, + "diversity_loss_mlp": 0.0, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.06676527060715894, + "language_loss": 0.86419082, + "learning_rate": 0.000918887900703433, + "loss": 0.8757261, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1083, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129996, + "balance_loss_mlp": 1.11559522, + "diversity_loss_mlp": 0.0, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07296749014166971, + "language_loss": 0.89779425, + "learning_rate": 0.0009187177128188999, + "loss": 0.90909421, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1084, + "time_per_iteration": 2.441312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_mlp": 1.11915255, + "diversity_loss_mlp": 0.0, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.053207927956046876, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78285372, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1085, + "time_per_iteration": 4.864179849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117368, + "balance_loss_mlp": 1.1029439, + "diversity_loss_mlp": 0.0, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07905606819783856, + "language_loss": 0.85833263, + "learning_rate": 0.000918376849434071, + "loss": 0.86950636, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1086, + "time_per_iteration": 4.049270868301392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112999, + "balance_loss_mlp": 1.09849179, + "diversity_loss_mlp": 0.0, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.08954509639668791, + "language_loss": 0.90778226, + "learning_rate": 0.0009182061740661098, + "loss": 0.91891223, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1087, + "time_per_iteration": 2.557358741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128974, + "balance_loss_mlp": 1.11446643, + "diversity_loss_mlp": 0.0, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08446380837501397, + "language_loss": 0.85054636, + "learning_rate": 0.0009180353363361127, + "loss": 0.86183608, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1088, + "time_per_iteration": 3.0897305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118573, + "balance_loss_mlp": 1.10417306, + "diversity_loss_mlp": 0.0, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.08173869768976531, + "language_loss": 0.82508695, + "learning_rate": 0.0009178643363104044, + "loss": 0.83627272, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1089, + "time_per_iteration": 3.124645948410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_mlp": 1.09938824, + "diversity_loss_mlp": 0.0, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.09307233053408402, + "language_loss": 0.90518665, + "learning_rate": 0.0009176931740553735, + "loss": 0.9163233, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1090, + "time_per_iteration": 2.6098225116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.09981966, + "diversity_loss_mlp": 0.0, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.09489388322063774, + "language_loss": 0.8240813, + "learning_rate": 0.0009175218496374708, + "loss": 0.83521861, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1091, + "time_per_iteration": 3.336355686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110612, + "balance_loss_mlp": 1.09205294, + "diversity_loss_mlp": 0.0, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.08870561470384966, + "language_loss": 0.86057436, + "learning_rate": 0.0009173503631232103, + "loss": 0.87163556, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1092, + "time_per_iteration": 3.356015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106884, + "balance_loss_mlp": 1.09269798, + "diversity_loss_mlp": 0.0, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09478788106803046, + "language_loss": 0.82067865, + "learning_rate": 0.0009171787145791691, + "loss": 0.83174753, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1093, + "time_per_iteration": 3.2546143531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116222, + "balance_loss_mlp": 1.10199988, + "diversity_loss_mlp": 0.0, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14674509624116924, + "language_loss": 0.80160701, + "learning_rate": 0.000917006904071987, + "loss": 0.81276917, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1094, + "time_per_iteration": 2.5837080478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911953, + "balance_loss_mlp": 1.58726883, + "diversity_loss_mlp": 0.20477253, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.035943125208157026, + "language_loss": 0.8737694, + "learning_rate": 0.0009168349316683669, + "loss": 0.88288891, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01593196, + "step": 1095, + "time_per_iteration": 2.768296718597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136825, + "balance_loss_mlp": 1.1224122, + "diversity_loss_mlp": 0.0, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06639171103878667, + "language_loss": 0.82719827, + "learning_rate": 0.0009166627974350741, + "loss": 0.83856648, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1096, + "time_per_iteration": 2.8819992542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145046, + "balance_loss_mlp": 1.13041949, + "diversity_loss_mlp": 0.0, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.08337696606413014, + "language_loss": 0.89929205, + "learning_rate": 0.0009164905014389373, + "loss": 0.91074252, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1097, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_mlp": 1.1495918, + "diversity_loss_mlp": 0.0, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08033808486911229, + "language_loss": 0.86386079, + "learning_rate": 0.0009163180437468476, + "loss": 0.87549889, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1098, + "time_per_iteration": 2.6314592361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176615, + "balance_loss_mlp": 1.16195273, + "diversity_loss_mlp": 0.0, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.09094665560265827, + "language_loss": 0.85629344, + "learning_rate": 0.000916145424425759, + "loss": 0.86805964, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1099, + "time_per_iteration": 2.6608541011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181873, + "balance_loss_mlp": 1.16744852, + "diversity_loss_mlp": 0.0, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.09944182260515583, + "language_loss": 0.9083795, + "learning_rate": 0.0009159726435426885, + "loss": 0.9201982, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1100, + "time_per_iteration": 3.0502405166625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.134619, + "diversity_loss_mlp": 0.0, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.09151162791452093, + "language_loss": 0.90900993, + "learning_rate": 0.0009157997011647154, + "loss": 0.92050231, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1101, + "time_per_iteration": 2.6048476696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127613, + "balance_loss_mlp": 1.11389172, + "diversity_loss_mlp": 0.0, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.07696729699318336, + "language_loss": 0.86130077, + "learning_rate": 0.0009156265973589817, + "loss": 0.87257689, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1102, + "time_per_iteration": 2.7552144527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114805, + "balance_loss_mlp": 1.10088181, + "diversity_loss_mlp": 0.0, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.07661877314329607, + "language_loss": 0.89485067, + "learning_rate": 0.0009154533321926926, + "loss": 0.90599877, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.13909912, + "routerloss_mlp": 0.0, + "step": 1103, + "time_per_iteration": 4.073851108551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_mlp": 1.09134197, + "diversity_loss_mlp": 0.0, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08363594534482698, + "language_loss": 0.8717171, + "learning_rate": 0.0009152799057331156, + "loss": 0.88276958, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1104, + "time_per_iteration": 3.142221450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_mlp": 1.08656633, + "diversity_loss_mlp": 0.0, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1056362594360365, + "language_loss": 0.91270363, + "learning_rate": 0.0009151063180475805, + "loss": 0.92370498, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1105, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095772, + "balance_loss_mlp": 1.08196795, + "diversity_loss_mlp": 0.0, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.08072473316090223, + "language_loss": 0.84285367, + "learning_rate": 0.0009149325692034803, + "loss": 0.85381138, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1106, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.06266928, + "diversity_loss_mlp": 0.0, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.04229613635199888, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.8027482, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1107, + "time_per_iteration": 4.817704916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129097, + "balance_loss_mlp": 1.11547112, + "diversity_loss_mlp": 0.0, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.07382538641756346, + "language_loss": 0.8748607, + "learning_rate": 0.0009145845883094678, + "loss": 0.88615161, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1108, + "time_per_iteration": 3.039318561553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150208, + "balance_loss_mlp": 1.13671303, + "diversity_loss_mlp": 0.0, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07887220377556703, + "language_loss": 0.85174125, + "learning_rate": 0.000914410356394654, + "loss": 0.86324334, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1109, + "time_per_iteration": 2.76413893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_mlp": 1.1484766, + "diversity_loss_mlp": 0.0, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.06362602917472766, + "language_loss": 0.84447891, + "learning_rate": 0.0009142359635914709, + "loss": 0.85609984, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1110, + "time_per_iteration": 3.007201671600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163563, + "balance_loss_mlp": 1.15004468, + "diversity_loss_mlp": 0.0, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07633144605420673, + "language_loss": 0.84598219, + "learning_rate": 0.0009140614099676245, + "loss": 0.85761786, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1111, + "time_per_iteration": 2.569401979446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161722, + "balance_loss_mlp": 1.14807272, + "diversity_loss_mlp": 0.0, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.0712977258009472, + "language_loss": 0.82590818, + "learning_rate": 0.0009138866955908821, + "loss": 0.83752549, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1112, + "time_per_iteration": 2.870701789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_mlp": 1.15294182, + "diversity_loss_mlp": 0.0, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.09239605609063735, + "language_loss": 0.80485952, + "learning_rate": 0.0009137118205290738, + "loss": 0.81652606, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.13739014, + "routerloss_mlp": 0.0, + "step": 1113, + "time_per_iteration": 2.9623591899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174843, + "balance_loss_mlp": 1.16082442, + "diversity_loss_mlp": 0.0, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08763873550503462, + "language_loss": 0.90553653, + "learning_rate": 0.0009135367848500924, + "loss": 0.91728497, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1114, + "time_per_iteration": 2.5287492275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.15138936, + "diversity_loss_mlp": 0.0, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.11593363319598911, + "language_loss": 0.86361086, + "learning_rate": 0.0009133615886218927, + "loss": 0.87526232, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1115, + "time_per_iteration": 2.6945505142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141616, + "balance_loss_mlp": 1.12725139, + "diversity_loss_mlp": 0.0, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08371979294567897, + "language_loss": 0.87389791, + "learning_rate": 0.0009131862319124917, + "loss": 0.88531411, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1116, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130003, + "balance_loss_mlp": 1.1162107, + "diversity_loss_mlp": 0.0, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08272793517794225, + "language_loss": 0.83981287, + "learning_rate": 0.0009130107147899691, + "loss": 0.85111284, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1117, + "time_per_iteration": 2.698151111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.1039083, + "diversity_loss_mlp": 0.0, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.4685945915436946, + "language_loss": 0.85086691, + "learning_rate": 0.0009128350373224665, + "loss": 0.86204791, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1118, + "time_per_iteration": 2.545565128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059182, + "balance_loss_mlp": 1.04950213, + "diversity_loss_mlp": 0.0, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.03761711697708654, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82515609, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1119, + "time_per_iteration": 4.648902416229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118843, + "balance_loss_mlp": 1.10412121, + "diversity_loss_mlp": 0.0, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07492511871579786, + "language_loss": 0.85205054, + "learning_rate": 0.0009124832016254005, + "loss": 0.86323893, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1120, + "time_per_iteration": 2.5875513553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112958, + "balance_loss_mlp": 1.11404657, + "diversity_loss_mlp": 0.0, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.10623123993924175, + "language_loss": 0.88117284, + "learning_rate": 0.0009123070435324316, + "loss": 0.89246857, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1121, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119114, + "balance_loss_mlp": 1.10852826, + "diversity_loss_mlp": 0.0, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.05861429426141409, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78994894, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 1122, + "time_per_iteration": 4.993450880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114894, + "balance_loss_mlp": 1.13229823, + "diversity_loss_mlp": 0.0, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.09758120262844092, + "language_loss": 0.86477894, + "learning_rate": 0.0009119542471995752, + "loss": 0.87626839, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 1123, + "time_per_iteration": 2.8260560035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132116, + "balance_loss_mlp": 1.1160109, + "diversity_loss_mlp": 0.0, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.1175490331770948, + "language_loss": 0.81597894, + "learning_rate": 0.0009117776090966554, + "loss": 0.82730007, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1124, + "time_per_iteration": 2.955768585205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133281, + "balance_loss_mlp": 1.1166153, + "diversity_loss_mlp": 0.0, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08908783615486303, + "language_loss": 0.86717665, + "learning_rate": 0.0009116008111274899, + "loss": 0.87850952, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1125, + "time_per_iteration": 3.2493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_mlp": 1.02921367, + "diversity_loss_mlp": 0.0, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.03267712428803131, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80145574, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 1126, + "time_per_iteration": 4.8121678829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_mlp": 1.13257909, + "diversity_loss_mlp": 0.0, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.09699177011816186, + "language_loss": 0.85244691, + "learning_rate": 0.0009112467358650396, + "loss": 0.86393118, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1127, + "time_per_iteration": 3.144075393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_mlp": 1.15056634, + "diversity_loss_mlp": 0.0, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.07985175184807933, + "language_loss": 0.86319685, + "learning_rate": 0.0009110694587092192, + "loss": 0.87486213, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.1595459, + "routerloss_mlp": 0.0, + "step": 1128, + "time_per_iteration": 2.7497644424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179675, + "balance_loss_mlp": 1.1634866, + "diversity_loss_mlp": 0.0, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.1038215552752292, + "language_loss": 0.81267089, + "learning_rate": 0.0009108920219620815, + "loss": 0.82446766, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 1129, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195026, + "balance_loss_mlp": 1.1788609, + "diversity_loss_mlp": 0.0, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06771714561059723, + "language_loss": 0.89286679, + "learning_rate": 0.0009107144256925133, + "loss": 0.9048171, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1130, + "time_per_iteration": 2.6569926738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196317, + "balance_loss_mlp": 1.18006873, + "diversity_loss_mlp": 0.0, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08333124164895586, + "language_loss": 0.82520813, + "learning_rate": 0.0009105366699694638, + "loss": 0.83717132, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 1131, + "time_per_iteration": 2.7384698390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200769, + "balance_loss_mlp": 1.18390059, + "diversity_loss_mlp": 0.0, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.07018840625680964, + "language_loss": 0.81826723, + "learning_rate": 0.0009103587548619439, + "loss": 0.83027488, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 1132, + "time_per_iteration": 2.8361291885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188026, + "balance_loss_mlp": 1.17064476, + "diversity_loss_mlp": 0.0, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.08238158624987729, + "language_loss": 0.85952497, + "learning_rate": 0.0009101806804390261, + "loss": 0.87140524, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.1739502, + "routerloss_mlp": 0.0, + "step": 1133, + "time_per_iteration": 2.8646528720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846565, + "balance_loss_mlp": 1.45559311, + "diversity_loss_mlp": 0.20202307, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.03511986753794681, + "language_loss": 0.90682399, + "learning_rate": 0.0009100024467698453, + "loss": 0.91528964, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01775702, + "step": 1134, + "time_per_iteration": 2.628955364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119036, + "balance_loss_mlp": 1.17289567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.09831196896097749, + "language_loss": 0.82889581, + "learning_rate": 0.0009098240539235981, + "loss": 0.84079945, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 1135, + "time_per_iteration": 2.6857638359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_mlp": 1.16191649, + "diversity_loss_mlp": 0.0, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07855046788509763, + "language_loss": 0.87649047, + "learning_rate": 0.0009096455019695423, + "loss": 0.88828909, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 1136, + "time_per_iteration": 2.814746856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_mlp": 1.15702188, + "diversity_loss_mlp": 0.0, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.090535881946018, + "language_loss": 0.89789271, + "learning_rate": 0.000909466790976998, + "loss": 0.90964472, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18188477, + "routerloss_mlp": 0.0, + "step": 1137, + "time_per_iteration": 2.503934144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13231349, + "diversity_loss_mlp": 0.0, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07386356915969775, + "language_loss": 0.82546908, + "learning_rate": 0.0009092879210153473, + "loss": 0.83698207, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18981934, + "routerloss_mlp": 0.0, + "step": 1138, + "time_per_iteration": 3.106015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143167, + "balance_loss_mlp": 1.12445128, + "diversity_loss_mlp": 0.0, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08443059177839436, + "language_loss": 0.89126158, + "learning_rate": 0.0009091088921540333, + "loss": 0.90269327, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.18701172, + "routerloss_mlp": 0.0, + "step": 1139, + "time_per_iteration": 2.5165584087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197317, + "balance_loss_mlp": 1.18491888, + "diversity_loss_mlp": 0.0, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.06938907882855633, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76705992, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 1140, + "time_per_iteration": 4.907839775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845315, + "balance_loss_mlp": 1.45913088, + "diversity_loss_mlp": 0.19676474, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.04157801253712285, + "language_loss": 0.84799111, + "learning_rate": 0.0009087503580104985, + "loss": 0.8564443, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01736734, + "step": 1141, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106137, + "balance_loss_mlp": 1.08643126, + "diversity_loss_mlp": 0.0, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09652849342648293, + "language_loss": 0.7964108, + "learning_rate": 0.0009085708528674728, + "loss": 0.80747211, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 1142, + "time_per_iteration": 2.7800490856170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.09476519, + "diversity_loss_mlp": 0.0, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.11345906914127299, + "language_loss": 0.8700006, + "learning_rate": 0.0009083911891031745, + "loss": 0.88115132, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 1143, + "time_per_iteration": 3.104893684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_mlp": 1.08533978, + "diversity_loss_mlp": 0.0, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.12428556161586228, + "language_loss": 0.91569418, + "learning_rate": 0.0009082113667873553, + "loss": 0.92673439, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18676758, + "routerloss_mlp": 0.0, + "step": 1144, + "time_per_iteration": 3.0838277339935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_mlp": 1.12060392, + "diversity_loss_mlp": 0.0, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.0955721440223133, + "language_loss": 0.90911627, + "learning_rate": 0.0009080313859898283, + "loss": 0.92050546, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 1145, + "time_per_iteration": 2.4998109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.14463091, + "diversity_loss_mlp": 0.0, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07871728913387968, + "language_loss": 0.91642439, + "learning_rate": 0.0009078512467804684, + "loss": 0.92804986, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.17932129, + "routerloss_mlp": 0.0, + "step": 1146, + "time_per_iteration": 2.583137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192448, + "balance_loss_mlp": 1.17516243, + "diversity_loss_mlp": 0.0, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.10815580627735921, + "language_loss": 0.90245295, + "learning_rate": 0.0009076709492292119, + "loss": 0.91437739, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 1147, + "time_per_iteration": 2.6189510822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199389, + "balance_loss_mlp": 1.18260384, + "diversity_loss_mlp": 0.0, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.10018226205073696, + "language_loss": 0.88948917, + "learning_rate": 0.0009074904934060562, + "loss": 0.90148306, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1148, + "time_per_iteration": 2.6619913578033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.17623389, + "diversity_loss_mlp": 0.0, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.09879445691718633, + "language_loss": 0.85041308, + "learning_rate": 0.0009073098793810607, + "loss": 0.8623414, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.1661377, + "routerloss_mlp": 0.0, + "step": 1149, + "time_per_iteration": 2.9382119178771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185083, + "balance_loss_mlp": 1.16848898, + "diversity_loss_mlp": 0.0, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.09716543961816822, + "language_loss": 0.88557786, + "learning_rate": 0.000907129107224346, + "loss": 0.89742863, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.16601562, + "routerloss_mlp": 0.0, + "step": 1150, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.17356002, + "diversity_loss_mlp": 0.0, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.0741661773141201, + "language_loss": 0.88313866, + "learning_rate": 0.0009069481770060939, + "loss": 0.89504004, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1151, + "time_per_iteration": 2.676938056945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118655, + "balance_loss_mlp": 1.17039752, + "diversity_loss_mlp": 0.0, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06827936796637825, + "language_loss": 0.83848286, + "learning_rate": 0.000906767088796548, + "loss": 0.85034835, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1152, + "time_per_iteration": 3.442782163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185002, + "balance_loss_mlp": 1.16889715, + "diversity_loss_mlp": 0.0, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.07358747282835834, + "language_loss": 0.87001419, + "learning_rate": 0.0009065858426660127, + "loss": 0.88186425, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1153, + "time_per_iteration": 2.6501753330230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178927, + "balance_loss_mlp": 1.16286922, + "diversity_loss_mlp": 0.0, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.0863709920952229, + "language_loss": 0.84764236, + "learning_rate": 0.0009064044386848543, + "loss": 0.85943162, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1154, + "time_per_iteration": 2.920689344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176891, + "balance_loss_mlp": 1.16032064, + "diversity_loss_mlp": 0.0, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.07669791788600007, + "language_loss": 0.88829726, + "learning_rate": 0.0009062228769234997, + "loss": 0.90006614, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 1155, + "time_per_iteration": 2.561638832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154629, + "balance_loss_mlp": 1.13797593, + "diversity_loss_mlp": 0.0, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08447027490527963, + "language_loss": 0.81123281, + "learning_rate": 0.0009060411574524376, + "loss": 0.82277906, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 1156, + "time_per_iteration": 2.655132293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162354, + "balance_loss_mlp": 1.14597416, + "diversity_loss_mlp": 0.0, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.08665349089557017, + "language_loss": 0.87817705, + "learning_rate": 0.0009058592803422178, + "loss": 0.88980061, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 1157, + "time_per_iteration": 3.1417362689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_mlp": 1.17430186, + "diversity_loss_mlp": 0.0, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.06198684812147071, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79893315, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1158, + "time_per_iteration": 4.867843866348267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.1120069, + "diversity_loss_mlp": 0.0, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0864152607347894, + "language_loss": 0.90156865, + "learning_rate": 0.00090549505348681, + "loss": 0.91285539, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1159, + "time_per_iteration": 2.581865072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118982, + "balance_loss_mlp": 1.1025548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07056827667929483, + "language_loss": 0.83819324, + "learning_rate": 0.0009053127038830275, + "loss": 0.84938306, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 1160, + "time_per_iteration": 2.9969708919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881169, + "balance_loss_mlp": 1.53314447, + "diversity_loss_mlp": 0.19063006, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.04002382495760162, + "language_loss": 0.87460124, + "learning_rate": 0.000905130196922898, + "loss": 0.88341296, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01928164, + "step": 1161, + "time_per_iteration": 2.6307718753814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881407, + "balance_loss_mlp": 1.5316093, + "diversity_loss_mlp": 0.19140732, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.030280826501304762, + "language_loss": 0.86784196, + "learning_rate": 0.0009049475326772769, + "loss": 0.87665606, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01989887, + "step": 1162, + "time_per_iteration": 2.6021478176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00889034, + "balance_loss_mlp": 1.54766631, + "diversity_loss_mlp": 0.19066738, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.03198536270345376, + "language_loss": 0.83124602, + "learning_rate": 0.0009047647112170811, + "loss": 0.84013629, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01986698, + "step": 1163, + "time_per_iteration": 2.804150342941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123868, + "balance_loss_mlp": 1.1070838, + "diversity_loss_mlp": 0.0, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.09901141435665076, + "language_loss": 0.87948084, + "learning_rate": 0.0009045817326132876, + "loss": 0.89071947, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1164, + "time_per_iteration": 3.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125369, + "balance_loss_mlp": 1.107988, + "diversity_loss_mlp": 0.0, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08432013167879508, + "language_loss": 0.83142793, + "learning_rate": 0.0009043985969369357, + "loss": 0.84268159, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17407227, + "routerloss_mlp": 0.0, + "step": 1165, + "time_per_iteration": 2.8148193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146301, + "balance_loss_mlp": 1.12976706, + "diversity_loss_mlp": 0.0, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06944445596490195, + "language_loss": 0.84334069, + "learning_rate": 0.0009042153042591245, + "loss": 0.85480368, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 1166, + "time_per_iteration": 2.8004493713378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142176, + "balance_loss_mlp": 1.12542677, + "diversity_loss_mlp": 0.0, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.06821660135571728, + "language_loss": 0.85225487, + "learning_rate": 0.0009040318546510146, + "loss": 0.86367661, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 1167, + "time_per_iteration": 3.1969215869903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156354, + "balance_loss_mlp": 1.13979554, + "diversity_loss_mlp": 0.0, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.06547364647617461, + "language_loss": 0.84988701, + "learning_rate": 0.0009038482481838275, + "loss": 0.86145055, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 1168, + "time_per_iteration": 2.7087180614471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00861334, + "balance_loss_mlp": 1.49333596, + "diversity_loss_mlp": 0.19261675, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.02892951533663535, + "language_loss": 0.87266529, + "learning_rate": 0.0009036644849288455, + "loss": 0.88127863, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01835741, + "step": 1169, + "time_per_iteration": 3.1039352416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.1631248, + "diversity_loss_mlp": 0.0, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.06865085555084699, + "language_loss": 0.85404736, + "learning_rate": 0.0009034805649574118, + "loss": 0.86584634, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.16784668, + "routerloss_mlp": 0.0, + "step": 1170, + "time_per_iteration": 2.659322738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208955, + "balance_loss_mlp": 1.1926589, + "diversity_loss_mlp": 0.0, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.07685307661183591, + "language_loss": 0.85691977, + "learning_rate": 0.0009032964883409308, + "loss": 0.86900926, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 1171, + "time_per_iteration": 2.8938751220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128211, + "balance_loss_mlp": 1.11910319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.06058864885284362, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74178743, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 1172, + "time_per_iteration": 4.983820676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217918, + "balance_loss_mlp": 1.20207548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.1048847225020503, + "language_loss": 0.8717351, + "learning_rate": 0.0009029278654587462, + "loss": 0.88391435, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.1583252, + "routerloss_mlp": 0.0, + "step": 1173, + "time_per_iteration": 2.639632225036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_mlp": 1.16508245, + "diversity_loss_mlp": 0.0, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.07111002228073603, + "language_loss": 0.82226282, + "learning_rate": 0.0009027433193361548, + "loss": 0.83407944, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1174, + "time_per_iteration": 2.7443323135375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159983, + "balance_loss_mlp": 1.14366364, + "diversity_loss_mlp": 0.0, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06531304020653, + "language_loss": 0.86980343, + "learning_rate": 0.00090255861685474, + "loss": 0.88140327, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 1175, + "time_per_iteration": 2.7534220218658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142116, + "balance_loss_mlp": 1.12533128, + "diversity_loss_mlp": 0.0, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.10016618462748716, + "language_loss": 0.90750074, + "learning_rate": 0.0009023737580862095, + "loss": 0.91892195, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1176, + "time_per_iteration": 2.5116937160491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_mlp": 1.12470055, + "diversity_loss_mlp": 0.0, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0707285441494173, + "language_loss": 0.83225566, + "learning_rate": 0.0009021887431023321, + "loss": 0.84366333, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1177, + "time_per_iteration": 2.599956512451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130034, + "balance_loss_mlp": 1.11444104, + "diversity_loss_mlp": 0.0, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08431891612549362, + "language_loss": 0.87212515, + "learning_rate": 0.0009020035719749369, + "loss": 0.88342547, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1178, + "time_per_iteration": 2.7144312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135701, + "balance_loss_mlp": 1.1205014, + "diversity_loss_mlp": 0.0, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.09883499682369536, + "language_loss": 0.77450085, + "learning_rate": 0.0009018182447759136, + "loss": 0.7858578, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1179, + "time_per_iteration": 2.98848557472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137145, + "balance_loss_mlp": 1.12187457, + "diversity_loss_mlp": 0.0, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.08173095074239418, + "language_loss": 0.79878223, + "learning_rate": 0.0009016327615772126, + "loss": 0.81015366, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1180, + "time_per_iteration": 2.9338154792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149275, + "balance_loss_mlp": 1.13449335, + "diversity_loss_mlp": 0.0, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.08374692364956231, + "language_loss": 0.87680298, + "learning_rate": 0.0009014471224508451, + "loss": 0.88829577, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1181, + "time_per_iteration": 2.7131431102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881934, + "balance_loss_mlp": 1.53494334, + "diversity_loss_mlp": 0.19571492, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.04185105584005936, + "language_loss": 0.83154267, + "learning_rate": 0.0009012613274688823, + "loss": 0.84036207, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01660516, + "step": 1182, + "time_per_iteration": 2.649559736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184244, + "balance_loss_mlp": 1.1692239, + "diversity_loss_mlp": 0.0, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.12019924395271459, + "language_loss": 0.87753081, + "learning_rate": 0.0009010753767034565, + "loss": 0.8893733, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1183, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175003, + "balance_loss_mlp": 1.16030502, + "diversity_loss_mlp": 0.0, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.08783280174490297, + "language_loss": 0.78918862, + "learning_rate": 0.0009008892702267599, + "loss": 0.80093861, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1184, + "time_per_iteration": 2.9962406158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139115, + "balance_loss_mlp": 1.12460732, + "diversity_loss_mlp": 0.0, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.08254121322216867, + "language_loss": 0.88525105, + "learning_rate": 0.0009007030081110457, + "loss": 0.89664215, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1185, + "time_per_iteration": 2.5990660190582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_mlp": 1.11087465, + "diversity_loss_mlp": 0.0, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.07610459395316062, + "language_loss": 0.84548527, + "learning_rate": 0.000900516590428627, + "loss": 0.85674113, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1186, + "time_per_iteration": 2.7377407550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121666, + "balance_loss_mlp": 1.1070751, + "diversity_loss_mlp": 0.0, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.13748029932532174, + "language_loss": 0.89182103, + "learning_rate": 0.0009003300172518778, + "loss": 0.90303767, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1187, + "time_per_iteration": 2.6916556358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_mlp": 1.10145736, + "diversity_loss_mlp": 0.0, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.11313229810108143, + "language_loss": 0.84335989, + "learning_rate": 0.0009001432886532321, + "loss": 0.85452211, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1188, + "time_per_iteration": 2.9698264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.06729358528862889, + "language_loss": 0.86774516, + "learning_rate": 0.0008999564047051843, + "loss": 0.87889242, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1189, + "time_per_iteration": 2.5002098083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_mlp": 1.12243462, + "diversity_loss_mlp": 0.0, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0714274855120672, + "language_loss": 0.84824312, + "learning_rate": 0.0008997693654802894, + "loss": 0.85961115, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1190, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149047, + "balance_loss_mlp": 1.13425303, + "diversity_loss_mlp": 0.0, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.07754985979781381, + "language_loss": 0.86714745, + "learning_rate": 0.0008995821710511625, + "loss": 0.87863791, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1191, + "time_per_iteration": 2.7126989364624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14807296, + "diversity_loss_mlp": 0.0, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.11547698788472376, + "language_loss": 0.85060751, + "learning_rate": 0.0008993948214904786, + "loss": 0.86223602, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1192, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152534, + "balance_loss_mlp": 1.14361739, + "diversity_loss_mlp": 0.0, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.05307726892258072, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79574746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 1193, + "time_per_iteration": 4.909748792648315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187526, + "balance_loss_mlp": 1.17205215, + "diversity_loss_mlp": 0.0, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.09739164860103838, + "language_loss": 0.78353333, + "learning_rate": 0.0008990196572654427, + "loss": 0.79540861, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.15454102, + "routerloss_mlp": 0.0, + "step": 1194, + "time_per_iteration": 2.8592262268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117424, + "balance_loss_mlp": 1.1592319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.06260411033315277, + "language_loss": 0.87559408, + "learning_rate": 0.0008988318427467426, + "loss": 0.88733649, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1195, + "time_per_iteration": 2.7444722652435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00878316, + "balance_loss_mlp": 1.52780199, + "diversity_loss_mlp": 0.1948241, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.0364111048645648, + "language_loss": 0.86376345, + "learning_rate": 0.0008986438733877887, + "loss": 0.87254667, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01700337, + "step": 1196, + "time_per_iteration": 3.5090088844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137546, + "balance_loss_mlp": 1.1229074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.08413871186116019, + "language_loss": 0.83810687, + "learning_rate": 0.0008984557492615576, + "loss": 0.84948236, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1197, + "time_per_iteration": 2.9953744411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10803354, + "diversity_loss_mlp": 0.0, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.08617240411661099, + "language_loss": 0.90267789, + "learning_rate": 0.0008982674704410854, + "loss": 0.91390687, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1198, + "time_per_iteration": 2.7513339519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110338, + "balance_loss_mlp": 1.09598517, + "diversity_loss_mlp": 0.0, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.11146547076727734, + "language_loss": 0.77876621, + "learning_rate": 0.0008980790369994682, + "loss": 0.78986955, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1199, + "time_per_iteration": 2.989825487136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120977, + "balance_loss_mlp": 1.10670781, + "diversity_loss_mlp": 0.0, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.0677628031660983, + "language_loss": 0.8729977, + "learning_rate": 0.000897890449009863, + "loss": 0.88420743, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1200, + "time_per_iteration": 2.6784448623657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_mlp": 1.11330509, + "diversity_loss_mlp": 0.0, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.080414080555838, + "language_loss": 0.89825618, + "learning_rate": 0.0008977017065454853, + "loss": 0.90953267, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1201, + "time_per_iteration": 2.6610703468322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880483, + "balance_loss_mlp": 1.52539706, + "diversity_loss_mlp": 0.19880572, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.03277795962214655, + "language_loss": 0.80367738, + "learning_rate": 0.0008975128096796121, + "loss": 0.81248224, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01838172, + "step": 1202, + "time_per_iteration": 2.901998996734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145011, + "balance_loss_mlp": 1.13089633, + "diversity_loss_mlp": 0.0, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.10693947298766643, + "language_loss": 0.85848922, + "learning_rate": 0.0008973237584855794, + "loss": 0.86993933, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1203, + "time_per_iteration": 2.872408151626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160017, + "balance_loss_mlp": 1.1457237, + "diversity_loss_mlp": 0.0, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08753213296005687, + "language_loss": 0.82586002, + "learning_rate": 0.0008971345530367832, + "loss": 0.83746028, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1204, + "time_per_iteration": 2.4641921520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185717, + "balance_loss_mlp": 1.17120886, + "diversity_loss_mlp": 0.0, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07947534631123947, + "language_loss": 0.85658818, + "learning_rate": 0.0008969451934066799, + "loss": 0.8684454, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1205, + "time_per_iteration": 2.7822117805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173591, + "balance_loss_mlp": 1.15872586, + "diversity_loss_mlp": 0.0, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08780432716538046, + "language_loss": 0.79991889, + "learning_rate": 0.0008967556796687854, + "loss": 0.81165481, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1206, + "time_per_iteration": 2.8849406242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117013, + "balance_loss_mlp": 1.15584886, + "diversity_loss_mlp": 0.0, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.07569633120476413, + "language_loss": 0.83779937, + "learning_rate": 0.0008965660118966752, + "loss": 0.84950066, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1207, + "time_per_iteration": 2.9316329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146692, + "balance_loss_mlp": 1.1319102, + "diversity_loss_mlp": 0.0, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06968265941642382, + "language_loss": 0.90114093, + "learning_rate": 0.0008963761901639851, + "loss": 0.91260791, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1208, + "time_per_iteration": 2.8140323162078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113879, + "balance_loss_mlp": 1.12392485, + "diversity_loss_mlp": 0.0, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.08612535310277082, + "language_loss": 0.83098078, + "learning_rate": 0.0008961862145444103, + "loss": 0.84236872, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1209, + "time_per_iteration": 2.7529945373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_mlp": 1.10796285, + "diversity_loss_mlp": 0.0, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08243119711445285, + "language_loss": 0.85338795, + "learning_rate": 0.0008959960851117059, + "loss": 0.86461735, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1210, + "time_per_iteration": 2.624340534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108554, + "balance_loss_mlp": 1.09396267, + "diversity_loss_mlp": 0.0, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.10596241027535934, + "language_loss": 0.84048676, + "learning_rate": 0.0008958058019396868, + "loss": 0.85157233, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1211, + "time_per_iteration": 2.8316566944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112068, + "balance_loss_mlp": 1.09751284, + "diversity_loss_mlp": 0.0, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.07651667178885936, + "language_loss": 0.86494702, + "learning_rate": 0.0008956153651022274, + "loss": 0.8760677, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1212, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_mlp": 1.08926892, + "diversity_loss_mlp": 0.0, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07459915787800217, + "language_loss": 0.83929688, + "learning_rate": 0.0008954247746732618, + "loss": 0.85033321, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1213, + "time_per_iteration": 2.6184399127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117524, + "balance_loss_mlp": 1.10321903, + "diversity_loss_mlp": 0.0, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.08317009769115577, + "language_loss": 0.90604293, + "learning_rate": 0.0008952340307267837, + "loss": 0.91721821, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1214, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_mlp": 1.10553002, + "diversity_loss_mlp": 0.0, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.09601716623847659, + "language_loss": 0.83731341, + "learning_rate": 0.0008950431333368468, + "loss": 0.84850979, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1215, + "time_per_iteration": 2.6151199340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130858, + "balance_loss_mlp": 1.11676729, + "diversity_loss_mlp": 0.0, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.08049188450288745, + "language_loss": 0.84623635, + "learning_rate": 0.0008948520825775634, + "loss": 0.8575449, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1216, + "time_per_iteration": 3.645200490951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123063, + "balance_loss_mlp": 1.10880601, + "diversity_loss_mlp": 0.0, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.08038238822992319, + "language_loss": 0.83978343, + "learning_rate": 0.0008946608785231067, + "loss": 0.85101402, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1217, + "time_per_iteration": 2.871616840362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126329, + "balance_loss_mlp": 1.11263156, + "diversity_loss_mlp": 0.0, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07832391647543825, + "language_loss": 0.84442961, + "learning_rate": 0.0008944695212477084, + "loss": 0.85569292, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1218, + "time_per_iteration": 2.507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123493, + "balance_loss_mlp": 1.10867572, + "diversity_loss_mlp": 0.0, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.07420792055611987, + "language_loss": 0.86334574, + "learning_rate": 0.0008942780108256599, + "loss": 0.87458062, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1219, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.07657909053901747, + "language_loss": 0.86160946, + "learning_rate": 0.0008940863473313121, + "loss": 0.87268722, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1220, + "time_per_iteration": 2.495164632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_mlp": 1.09272623, + "diversity_loss_mlp": 0.0, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07962638616920462, + "language_loss": 0.87889743, + "learning_rate": 0.0008938945308390756, + "loss": 0.88997114, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1221, + "time_per_iteration": 2.613927125930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097552, + "balance_loss_mlp": 1.08298469, + "diversity_loss_mlp": 0.0, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.06679649396710063, + "language_loss": 0.87179595, + "learning_rate": 0.00089370256142342, + "loss": 0.88277149, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1222, + "time_per_iteration": 2.732208013534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094745, + "balance_loss_mlp": 1.07952189, + "diversity_loss_mlp": 0.0, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.06680688140454344, + "language_loss": 0.84810197, + "learning_rate": 0.0008935104391588746, + "loss": 0.85904944, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1223, + "time_per_iteration": 2.7585461139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.07917881, + "diversity_loss_mlp": 0.0, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.07271030004651308, + "language_loss": 0.83111542, + "learning_rate": 0.0008933181641200276, + "loss": 0.84206444, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.15710449, + "routerloss_mlp": 0.0, + "step": 1224, + "time_per_iteration": 3.1440725326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087445, + "balance_loss_mlp": 1.07139981, + "diversity_loss_mlp": 0.0, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.07882513603721358, + "language_loss": 0.85824931, + "learning_rate": 0.0008931257363815271, + "loss": 0.8691237, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.16040039, + "routerloss_mlp": 0.0, + "step": 1225, + "time_per_iteration": 2.8887243270874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092595, + "balance_loss_mlp": 1.07659674, + "diversity_loss_mlp": 0.0, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.09571789824401095, + "language_loss": 0.89901638, + "learning_rate": 0.0008929331560180798, + "loss": 0.90994227, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.15991211, + "routerloss_mlp": 0.0, + "step": 1226, + "time_per_iteration": 2.897155284881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095366, + "balance_loss_mlp": 1.07965469, + "diversity_loss_mlp": 0.0, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.068724406385502, + "language_loss": 0.90771782, + "learning_rate": 0.0008927404231044525, + "loss": 0.91867149, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 1227, + "time_per_iteration": 2.6892144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.08764625, + "diversity_loss_mlp": 0.0, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.06943954848997126, + "language_loss": 0.81646705, + "learning_rate": 0.0008925475377154703, + "loss": 0.82749879, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1228, + "time_per_iteration": 2.727325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.11394727, + "diversity_loss_mlp": 0.0, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.0778889683705481, + "language_loss": 0.8212285, + "learning_rate": 0.0008923544999260183, + "loss": 0.83252132, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1229, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.13194346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.0853653064859127, + "language_loss": 0.91254115, + "learning_rate": 0.00089216130981104, + "loss": 0.92400861, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1230, + "time_per_iteration": 3.016228199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138894, + "balance_loss_mlp": 1.12364721, + "diversity_loss_mlp": 0.0, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.08048994442870243, + "language_loss": 0.82752085, + "learning_rate": 0.000891967967445539, + "loss": 0.83890975, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.15222168, + "routerloss_mlp": 0.0, + "step": 1231, + "time_per_iteration": 2.65736722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126061, + "balance_loss_mlp": 1.11135054, + "diversity_loss_mlp": 0.0, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.05909715635047166, + "language_loss": 0.889099, + "learning_rate": 0.0008917744729045772, + "loss": 0.90035963, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1232, + "time_per_iteration": 2.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110871, + "balance_loss_mlp": 1.0962795, + "diversity_loss_mlp": 0.0, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08046733758331526, + "language_loss": 0.83836448, + "learning_rate": 0.0008915808262632757, + "loss": 0.84947324, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1233, + "time_per_iteration": 2.860353708267212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918962, + "balance_loss_mlp": 1.60287488, + "diversity_loss_mlp": 0.20008399, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.03182006079144566, + "language_loss": 0.93544835, + "learning_rate": 0.0008913870275968148, + "loss": 0.94463801, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017482, + "step": 1234, + "time_per_iteration": 2.7328829765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095751, + "balance_loss_mlp": 1.08008718, + "diversity_loss_mlp": 0.0, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.07195832826776788, + "language_loss": 0.87503707, + "learning_rate": 0.0008911930769804342, + "loss": 0.88599461, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1235, + "time_per_iteration": 3.2619638442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091405, + "balance_loss_mlp": 1.07551408, + "diversity_loss_mlp": 0.0, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.07148547933088874, + "language_loss": 0.91313815, + "learning_rate": 0.0008909989744894318, + "loss": 0.92405218, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1236, + "time_per_iteration": 2.8687992095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06530952, + "diversity_loss_mlp": 0.0, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.08021447901266163, + "language_loss": 0.81662518, + "learning_rate": 0.0008908047201991649, + "loss": 0.8274349, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1237, + "time_per_iteration": 2.737638235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076608, + "balance_loss_mlp": 1.06138515, + "diversity_loss_mlp": 0.0, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.07749899394714953, + "language_loss": 0.86585152, + "learning_rate": 0.0008906103141850502, + "loss": 0.87661767, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.15197754, + "routerloss_mlp": 0.0, + "step": 1238, + "time_per_iteration": 2.9184746742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068848, + "balance_loss_mlp": 1.05385113, + "diversity_loss_mlp": 0.0, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.10230617436374452, + "language_loss": 0.88104367, + "learning_rate": 0.0008904157565225621, + "loss": 0.89173216, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.1496582, + "routerloss_mlp": 0.0, + "step": 1239, + "time_per_iteration": 2.6396749019622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_mlp": 1.06220865, + "diversity_loss_mlp": 0.0, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.10467557893696883, + "language_loss": 0.81824136, + "learning_rate": 0.000890221047287235, + "loss": 0.82901168, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1240, + "time_per_iteration": 3.496812582015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081371, + "balance_loss_mlp": 1.06710172, + "diversity_loss_mlp": 0.0, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.09443583580909311, + "language_loss": 0.91125917, + "learning_rate": 0.0008900261865546615, + "loss": 0.92207289, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1241, + "time_per_iteration": 2.6527724266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_mlp": 1.0890398, + "diversity_loss_mlp": 0.0, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.08429957072104315, + "language_loss": 0.84985352, + "learning_rate": 0.0008898311744004936, + "loss": 0.86089325, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1242, + "time_per_iteration": 2.6740338802337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118763, + "balance_loss_mlp": 1.10411179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.07332762129893158, + "language_loss": 0.86932802, + "learning_rate": 0.0008896360109004414, + "loss": 0.88051569, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1243, + "time_per_iteration": 2.643489122390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12715125, + "diversity_loss_mlp": 0.0, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09306092844590973, + "language_loss": 0.84636557, + "learning_rate": 0.0008894406961302742, + "loss": 0.85778666, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1244, + "time_per_iteration": 2.5876173973083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150798, + "balance_loss_mlp": 1.13590896, + "diversity_loss_mlp": 0.0, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.0838589606869783, + "language_loss": 0.83944738, + "learning_rate": 0.0008892452301658201, + "loss": 0.85095537, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1245, + "time_per_iteration": 2.928391218185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116146, + "balance_loss_mlp": 1.1460346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.0736247551351698, + "language_loss": 0.83299339, + "learning_rate": 0.0008890496130829653, + "loss": 0.84460801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1246, + "time_per_iteration": 2.6510462760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915571, + "balance_loss_mlp": 1.59993446, + "diversity_loss_mlp": 0.1987851, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.03287481157446996, + "language_loss": 0.85918486, + "learning_rate": 0.0008888538449576555, + "loss": 0.86834061, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621127, + "step": 1247, + "time_per_iteration": 2.5719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178279, + "balance_loss_mlp": 1.16323447, + "diversity_loss_mlp": 0.0, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10811715250715398, + "language_loss": 0.83036304, + "learning_rate": 0.0008886579258658944, + "loss": 0.8421458, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.15014648, + "routerloss_mlp": 0.0, + "step": 1248, + "time_per_iteration": 2.5736701488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148631, + "balance_loss_mlp": 1.13341999, + "diversity_loss_mlp": 0.0, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.07868761607649298, + "language_loss": 0.84717274, + "learning_rate": 0.0008884618558837446, + "loss": 0.85865903, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1249, + "time_per_iteration": 2.8215761184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911764, + "balance_loss_mlp": 1.59372783, + "diversity_loss_mlp": 0.19720009, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.03236174678929329, + "language_loss": 0.8677094, + "learning_rate": 0.0008882656350873273, + "loss": 0.87682706, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01629994, + "step": 1250, + "time_per_iteration": 2.885092258453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126022, + "balance_loss_mlp": 1.11122799, + "diversity_loss_mlp": 0.0, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.08347743908005935, + "language_loss": 0.87000573, + "learning_rate": 0.0008880692635528219, + "loss": 0.88126594, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1251, + "time_per_iteration": 3.049070119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106629, + "balance_loss_mlp": 1.09177542, + "diversity_loss_mlp": 0.0, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.07406446185181008, + "language_loss": 0.89514965, + "learning_rate": 0.0008878727413564669, + "loss": 0.90621597, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1252, + "time_per_iteration": 2.734839677810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.06804204, + "diversity_loss_mlp": 0.0, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.048930323133030355, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81211317, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1253, + "time_per_iteration": 4.854974031448364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00873083, + "balance_loss_mlp": 1.51531768, + "diversity_loss_mlp": 0.19563958, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.03648198852202315, + "language_loss": 0.78763413, + "learning_rate": 0.0008874792452834528, + "loss": 0.7963649, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01760404, + "step": 1254, + "time_per_iteration": 2.803690195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_mlp": 1.07530415, + "diversity_loss_mlp": 0.0, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.09659900556863026, + "language_loss": 0.8729195, + "learning_rate": 0.0008872822715595626, + "loss": 0.88381982, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1255, + "time_per_iteration": 2.657867670059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_mlp": 1.06968451, + "diversity_loss_mlp": 0.0, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10497791491954662, + "language_loss": 0.87333822, + "learning_rate": 0.0008870851474793598, + "loss": 0.88418031, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1256, + "time_per_iteration": 2.5694568157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_mlp": 1.06920075, + "diversity_loss_mlp": 0.0, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.07331256259210016, + "language_loss": 0.89243567, + "learning_rate": 0.0008868878731193752, + "loss": 0.90327322, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1257, + "time_per_iteration": 2.829789400100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086138, + "balance_loss_mlp": 1.07158267, + "diversity_loss_mlp": 0.0, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.07236027639177293, + "language_loss": 0.89720446, + "learning_rate": 0.0008866904485561973, + "loss": 0.90806586, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1258, + "time_per_iteration": 2.731635570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078524, + "balance_loss_mlp": 1.06384969, + "diversity_loss_mlp": 0.0, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.0727569881861308, + "language_loss": 0.83084273, + "learning_rate": 0.000886492873866473, + "loss": 0.84162796, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1259, + "time_per_iteration": 2.8250575065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_mlp": 1.06528533, + "diversity_loss_mlp": 0.0, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.10762424055834904, + "language_loss": 0.84672934, + "learning_rate": 0.000886295149126908, + "loss": 0.85753107, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1260, + "time_per_iteration": 2.7148356437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086434, + "balance_loss_mlp": 1.07181931, + "diversity_loss_mlp": 0.0, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.07159531524201106, + "language_loss": 0.85693741, + "learning_rate": 0.0008860972744142655, + "loss": 0.86780179, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1261, + "time_per_iteration": 2.931696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115009, + "balance_loss_mlp": 1.10064411, + "diversity_loss_mlp": 0.0, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.065367920687613, + "language_loss": 0.81639904, + "learning_rate": 0.0008858992498053671, + "loss": 0.82754916, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1262, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_mlp": 1.04764521, + "diversity_loss_mlp": 0.0, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.03374572714932058, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77644455, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1263, + "time_per_iteration": 4.882519006729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872344, + "balance_loss_mlp": 1.51226497, + "diversity_loss_mlp": 0.19974959, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.03166105856965055, + "language_loss": 0.83409035, + "learning_rate": 0.0008855027512063817, + "loss": 0.84281385, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633644, + "step": 1264, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185798, + "balance_loss_mlp": 1.17132628, + "diversity_loss_mlp": 0.0, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.06261248257395001, + "language_loss": 0.85949916, + "learning_rate": 0.0008853042773702292, + "loss": 0.8713572, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1265, + "time_per_iteration": 2.695514440536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196886, + "balance_loss_mlp": 1.18234205, + "diversity_loss_mlp": 0.0, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.08760826562773598, + "language_loss": 0.87981403, + "learning_rate": 0.0008851056539456896, + "loss": 0.89178288, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1266, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119913, + "balance_loss_mlp": 1.18489647, + "diversity_loss_mlp": 0.0, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.07991839198753149, + "language_loss": 0.81904382, + "learning_rate": 0.0008849068810098755, + "loss": 0.83103514, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1267, + "time_per_iteration": 3.3067915439605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174372, + "balance_loss_mlp": 1.15992332, + "diversity_loss_mlp": 0.0, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.10499473220259715, + "language_loss": 0.83550054, + "learning_rate": 0.0008847079586399575, + "loss": 0.84724426, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1268, + "time_per_iteration": 2.4791157245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.13699341, + "diversity_loss_mlp": 0.0, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07765469411987547, + "language_loss": 0.86144567, + "learning_rate": 0.0008845088869131641, + "loss": 0.87295628, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1269, + "time_per_iteration": 2.6733555793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_mlp": 1.10053682, + "diversity_loss_mlp": 0.0, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.0888033537849515, + "language_loss": 0.88898385, + "learning_rate": 0.0008843096659067818, + "loss": 0.90013218, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1270, + "time_per_iteration": 2.6315910816192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111213, + "balance_loss_mlp": 1.09708679, + "diversity_loss_mlp": 0.0, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.09475560383246978, + "language_loss": 0.86565858, + "learning_rate": 0.000884110295698155, + "loss": 0.87677073, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1271, + "time_per_iteration": 2.926668643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.08752966, + "diversity_loss_mlp": 0.0, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.09917556522455147, + "language_loss": 0.85849231, + "learning_rate": 0.0008839107763646861, + "loss": 0.86951411, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1272, + "time_per_iteration": 2.58022403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_mlp": 1.08751881, + "diversity_loss_mlp": 0.0, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.08783320449451974, + "language_loss": 0.89941388, + "learning_rate": 0.0008837111079838353, + "loss": 0.91043806, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1273, + "time_per_iteration": 2.6877150535583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111566, + "balance_loss_mlp": 1.10096157, + "diversity_loss_mlp": 0.0, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.07640958054403056, + "language_loss": 0.89671296, + "learning_rate": 0.000883511290633121, + "loss": 0.90786958, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1274, + "time_per_iteration": 2.5929813385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123812, + "balance_loss_mlp": 1.10898256, + "diversity_loss_mlp": 0.0, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.05814589763763208, + "language_loss": 0.92211604, + "learning_rate": 0.000883311324390119, + "loss": 0.93335414, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1275, + "time_per_iteration": 2.721343517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138447, + "balance_loss_mlp": 1.12315261, + "diversity_loss_mlp": 0.0, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.10098653640048322, + "language_loss": 0.81237984, + "learning_rate": 0.0008831112093324629, + "loss": 0.82376432, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.15283203, + "routerloss_mlp": 0.0, + "step": 1276, + "time_per_iteration": 3.066657543182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148152, + "balance_loss_mlp": 1.13266695, + "diversity_loss_mlp": 0.0, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07328274291062464, + "language_loss": 0.89255905, + "learning_rate": 0.0008829109455378444, + "loss": 0.90404058, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 1277, + "time_per_iteration": 2.6705071926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163573, + "balance_loss_mlp": 1.14844561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08343231090098181, + "language_loss": 0.86569774, + "learning_rate": 0.000882710533084013, + "loss": 0.87733346, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1278, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.13783133, + "diversity_loss_mlp": 0.0, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0729065811951457, + "language_loss": 0.8929435, + "learning_rate": 0.0008825099720487755, + "loss": 0.90446699, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1279, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00676302, + "balance_loss_mlp": 1.12665224, + "diversity_loss_mlp": 0.19835761, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.0027483074809680533, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.75937444, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0137972, + "step": 1280, + "time_per_iteration": 4.88429594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_mlp": 1.10232449, + "diversity_loss_mlp": 0.0, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.05615046205501133, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79055113, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1281, + "time_per_iteration": 4.752316236495972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113823, + "balance_loss_mlp": 1.09987593, + "diversity_loss_mlp": 0.0, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.08093958913819582, + "language_loss": 0.89542687, + "learning_rate": 0.0008819073982335619, + "loss": 0.90656507, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1282, + "time_per_iteration": 2.876927137374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_mlp": 1.08783603, + "diversity_loss_mlp": 0.0, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07169123109412263, + "language_loss": 0.84362143, + "learning_rate": 0.0008817062436519235, + "loss": 0.8546381, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.13824463, + "routerloss_mlp": 0.0, + "step": 1283, + "time_per_iteration": 2.6551387310028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086846, + "balance_loss_mlp": 1.5022366, + "diversity_loss_mlp": 0.20048198, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.033180516132009126, + "language_loss": 0.89655471, + "learning_rate": 0.0008815049408787788, + "loss": 0.90523928, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01710081, + "step": 1284, + "time_per_iteration": 2.5652830600738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_mlp": 1.08698821, + "diversity_loss_mlp": 0.0, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.0762028673981185, + "language_loss": 0.85473216, + "learning_rate": 0.0008813034899922805, + "loss": 0.86573577, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1285, + "time_per_iteration": 2.549622058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111306, + "balance_loss_mlp": 1.09783578, + "diversity_loss_mlp": 0.0, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.11471388318643767, + "language_loss": 0.89855313, + "learning_rate": 0.0008811018910706387, + "loss": 0.9096663, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1286, + "time_per_iteration": 2.575176954269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_mlp": 1.10453439, + "diversity_loss_mlp": 0.0, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.10517914532856759, + "language_loss": 0.81922066, + "learning_rate": 0.0008809001441921211, + "loss": 0.83040059, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1287, + "time_per_iteration": 2.732236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.1132865, + "diversity_loss_mlp": 0.0, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.1440229573277689, + "language_loss": 0.85392761, + "learning_rate": 0.0008806982494350528, + "loss": 0.86519527, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1288, + "time_per_iteration": 2.6544177532196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168468, + "balance_loss_mlp": 1.1549263, + "diversity_loss_mlp": 0.0, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07192560701016996, + "language_loss": 0.9021467, + "learning_rate": 0.0008804962068778161, + "loss": 0.91383135, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1289, + "time_per_iteration": 2.8321304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_mlp": 1.20329499, + "diversity_loss_mlp": 0.0, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.08274381184261048, + "language_loss": 0.81234664, + "learning_rate": 0.0008802940165988511, + "loss": 0.82451665, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1290, + "time_per_iteration": 2.848726749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262968, + "balance_loss_mlp": 1.24875808, + "diversity_loss_mlp": 0.0, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.09449787402071168, + "language_loss": 0.88461435, + "learning_rate": 0.000880091678676655, + "loss": 0.8972441, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1291, + "time_per_iteration": 2.802199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279654, + "balance_loss_mlp": 1.26553965, + "diversity_loss_mlp": 0.0, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.11843407890200246, + "language_loss": 0.88870949, + "learning_rate": 0.0008798891931897821, + "loss": 0.90150601, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1292, + "time_per_iteration": 2.7150259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870403, + "balance_loss_mlp": 1.50883341, + "diversity_loss_mlp": 0.20002533, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.035309457370921726, + "language_loss": 0.84031773, + "learning_rate": 0.0008796865602168447, + "loss": 0.84902173, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597392, + "step": 1293, + "time_per_iteration": 2.5952000617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210957, + "balance_loss_mlp": 1.19661582, + "diversity_loss_mlp": 0.0, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.07909897749306223, + "language_loss": 0.88611919, + "learning_rate": 0.0008794837798365115, + "loss": 0.89822876, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1294, + "time_per_iteration": 2.6257524490356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.15246725, + "diversity_loss_mlp": 0.0, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.06704316740686254, + "language_loss": 0.8866623, + "learning_rate": 0.0008792808521275089, + "loss": 0.89834166, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1295, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_mlp": 1.13757372, + "diversity_loss_mlp": 0.0, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.08601952378824393, + "language_loss": 0.87496305, + "learning_rate": 0.0008790777771686206, + "loss": 0.88649786, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1296, + "time_per_iteration": 2.6131319999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_mlp": 1.10882747, + "diversity_loss_mlp": 0.0, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.0951042007575699, + "language_loss": 0.8543523, + "learning_rate": 0.0008788745550386872, + "loss": 0.86559939, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 1297, + "time_per_iteration": 2.5590503215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115503, + "balance_loss_mlp": 1.09948111, + "diversity_loss_mlp": 0.0, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.07219065567928346, + "language_loss": 0.80291975, + "learning_rate": 0.0008786711858166063, + "loss": 0.81407487, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1298, + "time_per_iteration": 2.951768398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00871436, + "balance_loss_mlp": 1.51113367, + "diversity_loss_mlp": 0.19870289, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.03357842357877673, + "language_loss": 0.83488023, + "learning_rate": 0.0008784676695813332, + "loss": 0.84359455, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165179, + "step": 1299, + "time_per_iteration": 2.985684871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_mlp": 1.07411456, + "diversity_loss_mlp": 0.0, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07050099983107566, + "language_loss": 0.84900999, + "learning_rate": 0.0008782640064118796, + "loss": 0.85990846, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1300, + "time_per_iteration": 2.943368673324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139161, + "balance_loss_mlp": 1.13172245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.062054541004710057, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323914, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 1301, + "time_per_iteration": 4.975619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_mlp": 1.09055138, + "diversity_loss_mlp": 0.0, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.08145949094764637, + "language_loss": 0.86554521, + "learning_rate": 0.0008778562395867648, + "loss": 0.87660533, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1302, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111342, + "balance_loss_mlp": 1.09572554, + "diversity_loss_mlp": 0.0, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.0727542370097133, + "language_loss": 0.84224409, + "learning_rate": 0.0008776521360894127, + "loss": 0.85335743, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 1303, + "time_per_iteration": 2.6512627601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_mlp": 1.02259421, + "diversity_loss_mlp": 0.0, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.02979233866947858, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79991817, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.07128906, + "routerloss_mlp": 0.0, + "step": 1304, + "time_per_iteration": 4.802467107772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112993, + "balance_loss_mlp": 1.11518431, + "diversity_loss_mlp": 0.0, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.07060498048015267, + "language_loss": 0.9057076, + "learning_rate": 0.0008772434893213186, + "loss": 0.91700697, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1305, + "time_per_iteration": 2.601546049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137218, + "balance_loss_mlp": 1.12251997, + "diversity_loss_mlp": 0.0, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.13797279723809866, + "language_loss": 0.84362888, + "learning_rate": 0.0008770389462092276, + "loss": 0.85500103, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1306, + "time_per_iteration": 2.626138210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_mlp": 1.12685966, + "diversity_loss_mlp": 0.0, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08471108342240245, + "language_loss": 0.86803389, + "learning_rate": 0.0008768342567176357, + "loss": 0.87944913, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1307, + "time_per_iteration": 2.8074796199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.12681937, + "diversity_loss_mlp": 0.0, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07263390393133992, + "language_loss": 0.90559924, + "learning_rate": 0.0008766294209260107, + "loss": 0.91701508, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1308, + "time_per_iteration": 2.670790910720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.13312435, + "diversity_loss_mlp": 0.0, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.07764888634730133, + "language_loss": 0.91554916, + "learning_rate": 0.0008764244389138767, + "loss": 0.92702377, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1309, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147476, + "balance_loss_mlp": 1.13318276, + "diversity_loss_mlp": 0.0, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09714227143719616, + "language_loss": 0.82980847, + "learning_rate": 0.000876219310760815, + "loss": 0.8412832, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1310, + "time_per_iteration": 2.8601791858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_mlp": 1.13273418, + "diversity_loss_mlp": 0.0, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.09648806821544922, + "language_loss": 0.81436276, + "learning_rate": 0.0008760140365464631, + "loss": 0.82583249, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1311, + "time_per_iteration": 2.599353790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870128, + "balance_loss_mlp": 1.50605726, + "diversity_loss_mlp": 0.20002663, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.03529693250820236, + "language_loss": 0.871418, + "learning_rate": 0.0008758086163505156, + "loss": 0.88011926, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170862, + "step": 1312, + "time_per_iteration": 2.6166832447052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163863, + "balance_loss_mlp": 1.14953399, + "diversity_loss_mlp": 0.0, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.07147814499844148, + "language_loss": 0.89267951, + "learning_rate": 0.0008756030502527239, + "loss": 0.90431809, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1313, + "time_per_iteration": 2.8452062606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188075, + "balance_loss_mlp": 1.17377019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.09335955432973846, + "language_loss": 0.90298462, + "learning_rate": 0.0008753973383328954, + "loss": 0.91486537, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1314, + "time_per_iteration": 2.6988537311553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.15108287, + "diversity_loss_mlp": 0.0, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08872096542459323, + "language_loss": 0.83944553, + "learning_rate": 0.0008751914806708952, + "loss": 0.85110015, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1315, + "time_per_iteration": 2.6328680515289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.1372478, + "diversity_loss_mlp": 0.0, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.09247066962171595, + "language_loss": 0.81854099, + "learning_rate": 0.0008749854773466439, + "loss": 0.83005595, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1316, + "time_per_iteration": 2.6708498001098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134446, + "balance_loss_mlp": 1.11980653, + "diversity_loss_mlp": 0.0, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.06992463478304738, + "language_loss": 0.84568423, + "learning_rate": 0.0008747793284401192, + "loss": 0.85702872, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1317, + "time_per_iteration": 2.70182204246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_mlp": 1.10560477, + "diversity_loss_mlp": 0.0, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.11229953955213261, + "language_loss": 0.85994983, + "learning_rate": 0.0008745730340313551, + "loss": 0.87115788, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1318, + "time_per_iteration": 2.8026556968688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.1048007, + "diversity_loss_mlp": 0.0, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.0843917818222923, + "language_loss": 0.84519732, + "learning_rate": 0.0008743665942004422, + "loss": 0.85639453, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.14904785, + "routerloss_mlp": 0.0, + "step": 1319, + "time_per_iteration": 2.6717073917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120645, + "balance_loss_mlp": 1.10569644, + "diversity_loss_mlp": 0.0, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.06860607652829093, + "language_loss": 0.92769039, + "learning_rate": 0.0008741600090275277, + "loss": 0.93889689, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1320, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120587, + "balance_loss_mlp": 1.10530448, + "diversity_loss_mlp": 0.0, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.09643257369734548, + "language_loss": 0.8425917, + "learning_rate": 0.0008739532785928151, + "loss": 0.85379755, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1321, + "time_per_iteration": 3.4925267696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_mlp": 1.09305024, + "diversity_loss_mlp": 0.0, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.04547815076873398, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75994641, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1322, + "time_per_iteration": 4.8446879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0085354, + "balance_loss_mlp": 1.4814328, + "diversity_loss_mlp": 0.19370571, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.036800523279172735, + "language_loss": 0.82844102, + "learning_rate": 0.0008735393822590908, + "loss": 0.83697641, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597124, + "step": 1323, + "time_per_iteration": 2.7354650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174586, + "balance_loss_mlp": 1.16032863, + "diversity_loss_mlp": 0.0, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08280852347492981, + "language_loss": 0.87442601, + "learning_rate": 0.0008733322165207681, + "loss": 0.88617194, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1324, + "time_per_iteration": 2.6581695079803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_mlp": 1.18529749, + "diversity_loss_mlp": 0.0, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.0779912319299164, + "language_loss": 0.8296451, + "learning_rate": 0.0008731249058420247, + "loss": 0.84164721, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1325, + "time_per_iteration": 3.0674960613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203892, + "balance_loss_mlp": 1.18865728, + "diversity_loss_mlp": 0.0, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.10695670124077197, + "language_loss": 0.90080667, + "learning_rate": 0.0008729174503033459, + "loss": 0.91284555, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1326, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188403, + "balance_loss_mlp": 1.17334652, + "diversity_loss_mlp": 0.0, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.10125548093505272, + "language_loss": 0.82427752, + "learning_rate": 0.0008727098499852728, + "loss": 0.83616149, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1327, + "time_per_iteration": 2.833803415298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150318, + "balance_loss_mlp": 1.13529778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.08478455973869617, + "language_loss": 0.89778203, + "learning_rate": 0.0008725021049684034, + "loss": 0.90928519, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.15002441, + "routerloss_mlp": 0.0, + "step": 1328, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116795, + "balance_loss_mlp": 1.10194123, + "diversity_loss_mlp": 0.0, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.07099770943741918, + "language_loss": 0.83078361, + "learning_rate": 0.000872294215333391, + "loss": 0.84195161, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1329, + "time_per_iteration": 3.219834089279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099158, + "balance_loss_mlp": 1.08430433, + "diversity_loss_mlp": 0.0, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.06913408205057751, + "language_loss": 0.82662833, + "learning_rate": 0.0008720861811609457, + "loss": 0.8376199, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1330, + "time_per_iteration": 2.753122329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096587, + "balance_loss_mlp": 1.0816741, + "diversity_loss_mlp": 0.0, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0919113566921475, + "language_loss": 0.83719599, + "learning_rate": 0.0008718780025318338, + "loss": 0.84816188, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1331, + "time_per_iteration": 2.724808692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.09296656, + "diversity_loss_mlp": 0.0, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.09880415123515712, + "language_loss": 0.83982158, + "learning_rate": 0.0008716696795268771, + "loss": 0.85089689, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1332, + "time_per_iteration": 2.718421220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098797, + "balance_loss_mlp": 1.08430111, + "diversity_loss_mlp": 0.0, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.15208681676824193, + "language_loss": 0.85333431, + "learning_rate": 0.0008714612122269538, + "loss": 0.8643223, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1333, + "time_per_iteration": 2.877823829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120258, + "balance_loss_mlp": 1.10586989, + "diversity_loss_mlp": 0.0, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.07756137703605612, + "language_loss": 0.89334106, + "learning_rate": 0.0008712526007129982, + "loss": 0.90454364, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1334, + "time_per_iteration": 2.561842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155014, + "balance_loss_mlp": 1.14101923, + "diversity_loss_mlp": 0.0, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.12724628219842446, + "language_loss": 0.90676123, + "learning_rate": 0.0008710438450660003, + "loss": 0.91831136, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1335, + "time_per_iteration": 2.6618270874023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199277, + "balance_loss_mlp": 1.18486404, + "diversity_loss_mlp": 0.0, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.10895723532104484, + "language_loss": 0.87596953, + "learning_rate": 0.0008708349453670064, + "loss": 0.88796222, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1336, + "time_per_iteration": 2.5121865272521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195197, + "balance_loss_mlp": 1.18032002, + "diversity_loss_mlp": 0.0, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.10227195785495524, + "language_loss": 0.91035736, + "learning_rate": 0.0008706259016971185, + "loss": 0.92230934, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1337, + "time_per_iteration": 2.7760090827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_mlp": 1.17414773, + "diversity_loss_mlp": 0.0, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.12625436277937716, + "language_loss": 0.83095431, + "learning_rate": 0.0008704167141374944, + "loss": 0.84284496, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1338, + "time_per_iteration": 2.824122428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_mlp": 1.13107228, + "diversity_loss_mlp": 0.0, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.0801465901926633, + "language_loss": 0.88427222, + "learning_rate": 0.0008702073827693482, + "loss": 0.89573455, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1339, + "time_per_iteration": 2.708488941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_mlp": 1.0865202, + "diversity_loss_mlp": 0.0, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07445900988257396, + "language_loss": 0.88514435, + "learning_rate": 0.0008699979076739494, + "loss": 0.89615613, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1340, + "time_per_iteration": 2.960650682449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07054412, + "diversity_loss_mlp": 0.0, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.09041758143252471, + "language_loss": 0.88622832, + "learning_rate": 0.0008697882889326234, + "loss": 0.89708054, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1341, + "time_per_iteration": 2.5199689865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094608, + "balance_loss_mlp": 1.08043432, + "diversity_loss_mlp": 0.0, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.08157938691300957, + "language_loss": 0.86840844, + "learning_rate": 0.0008695785266267515, + "loss": 0.87935448, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1342, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089859, + "balance_loss_mlp": 1.56664371, + "diversity_loss_mlp": 0.19803861, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.03344075262961686, + "language_loss": 0.83491886, + "learning_rate": 0.0008693686208377704, + "loss": 0.84390479, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01624843, + "step": 1343, + "time_per_iteration": 2.8157622814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101399, + "balance_loss_mlp": 1.08711743, + "diversity_loss_mlp": 0.0, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07460013341605923, + "language_loss": 0.89022982, + "learning_rate": 0.0008691585716471733, + "loss": 0.90124375, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1344, + "time_per_iteration": 2.6386232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.09707415, + "diversity_loss_mlp": 0.0, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.08548738123283665, + "language_loss": 0.85822487, + "learning_rate": 0.0008689483791365079, + "loss": 0.86934054, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1345, + "time_per_iteration": 2.831817626953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112096, + "balance_loss_mlp": 1.10685778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07218857890204664, + "language_loss": 0.89327282, + "learning_rate": 0.0008687380433873786, + "loss": 0.90448248, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1346, + "time_per_iteration": 2.8322408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1251955, + "diversity_loss_mlp": 0.0, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.07612070672802876, + "language_loss": 0.82638776, + "learning_rate": 0.0008685275644814448, + "loss": 0.83778065, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1347, + "time_per_iteration": 2.689772367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116224, + "balance_loss_mlp": 1.14764857, + "diversity_loss_mlp": 0.0, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07884944678342334, + "language_loss": 0.84390515, + "learning_rate": 0.0008683169425004216, + "loss": 0.85552752, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1348, + "time_per_iteration": 2.895153760910034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159732, + "balance_loss_mlp": 1.14511704, + "diversity_loss_mlp": 0.0, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.10354145261803285, + "language_loss": 0.83314335, + "learning_rate": 0.0008681061775260799, + "loss": 0.84474063, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1349, + "time_per_iteration": 2.850862503051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_mlp": 1.15118265, + "diversity_loss_mlp": 0.0, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08416928552821445, + "language_loss": 0.9214983, + "learning_rate": 0.0008678952696402458, + "loss": 0.93315852, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1350, + "time_per_iteration": 2.525019884109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_mlp": 1.13848734, + "diversity_loss_mlp": 0.0, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07397225666721696, + "language_loss": 0.86554277, + "learning_rate": 0.000867684218924801, + "loss": 0.87707639, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1351, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_mlp": 1.07517958, + "diversity_loss_mlp": 0.0, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.0438698963901256, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80030328, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1352, + "time_per_iteration": 4.916059255599976 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132931, + "balance_loss_mlp": 1.11807716, + "diversity_loss_mlp": 0.0, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06358739416567256, + "language_loss": 0.85154414, + "learning_rate": 0.0008672616893328834, + "loss": 0.86287344, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1353, + "time_per_iteration": 2.9301464557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120208, + "balance_loss_mlp": 1.10545015, + "diversity_loss_mlp": 0.0, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.0804298790611747, + "language_loss": 0.89736795, + "learning_rate": 0.0008670502106204512, + "loss": 0.90857005, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.14733887, + "routerloss_mlp": 0.0, + "step": 1354, + "time_per_iteration": 2.8392651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121529, + "balance_loss_mlp": 1.10672283, + "diversity_loss_mlp": 0.0, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.08121830869095954, + "language_loss": 0.81676221, + "learning_rate": 0.0008668385894064892, + "loss": 0.82797754, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1355, + "time_per_iteration": 2.632744550704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.10095191, + "diversity_loss_mlp": 0.0, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.0871855710564252, + "language_loss": 0.88984954, + "learning_rate": 0.0008666268257731562, + "loss": 0.90100139, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1356, + "time_per_iteration": 3.0961363315582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132093, + "balance_loss_mlp": 1.11785948, + "diversity_loss_mlp": 0.0, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.08548634624367135, + "language_loss": 0.8594982, + "learning_rate": 0.0008664149198026662, + "loss": 0.87081909, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1357, + "time_per_iteration": 3.2423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_mlp": 1.12039137, + "diversity_loss_mlp": 0.0, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.09109654485188295, + "language_loss": 0.88802171, + "learning_rate": 0.0008662028715772883, + "loss": 0.89936113, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1358, + "time_per_iteration": 2.619495153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138578, + "balance_loss_mlp": 1.12476182, + "diversity_loss_mlp": 0.0, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.07135790209188476, + "language_loss": 0.85816395, + "learning_rate": 0.0008659906811793467, + "loss": 0.86954975, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.13842773, + "routerloss_mlp": 0.0, + "step": 1359, + "time_per_iteration": 2.6752817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135637, + "balance_loss_mlp": 1.12191582, + "diversity_loss_mlp": 0.0, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.07783428421444573, + "language_loss": 0.89649427, + "learning_rate": 0.0008657783486912215, + "loss": 0.90785068, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1360, + "time_per_iteration": 2.770136594772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918859, + "balance_loss_mlp": 1.60386825, + "diversity_loss_mlp": 0.20058532, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.03438194549161764, + "language_loss": 0.90315008, + "learning_rate": 0.0008655658741953472, + "loss": 0.91233867, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01663268, + "step": 1361, + "time_per_iteration": 3.239567518234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117406, + "balance_loss_mlp": 1.10352993, + "diversity_loss_mlp": 0.0, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.053733033776962646, + "language_loss": 0.88311911, + "learning_rate": 0.0008653532577742136, + "loss": 0.89429319, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1362, + "time_per_iteration": 2.6912107467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111717, + "balance_loss_mlp": 1.09805584, + "diversity_loss_mlp": 0.0, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07456283347469675, + "language_loss": 0.8687824, + "learning_rate": 0.0008651404995103659, + "loss": 0.87989956, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1363, + "time_per_iteration": 2.5554919242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_mlp": 1.09212554, + "diversity_loss_mlp": 0.0, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.0735216597505126, + "language_loss": 0.87311852, + "learning_rate": 0.0008649275994864041, + "loss": 0.88418221, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1364, + "time_per_iteration": 2.7228429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109936, + "balance_loss_mlp": 1.0955832, + "diversity_loss_mlp": 0.0, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06423000395680191, + "language_loss": 0.83767593, + "learning_rate": 0.0008647145577849834, + "loss": 0.84877527, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1365, + "time_per_iteration": 2.8194234371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_mlp": 1.09573257, + "diversity_loss_mlp": 0.0, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0636918785190987, + "language_loss": 0.82912111, + "learning_rate": 0.0008645013744888139, + "loss": 0.8402251, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1366, + "time_per_iteration": 2.9121909141540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106528, + "balance_loss_mlp": 1.09266424, + "diversity_loss_mlp": 0.0, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.07268525177684865, + "language_loss": 0.87255573, + "learning_rate": 0.0008642880496806607, + "loss": 0.88362104, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1367, + "time_per_iteration": 2.7527663707733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117256, + "balance_loss_mlp": 1.1027844, + "diversity_loss_mlp": 0.0, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.06883104565378229, + "language_loss": 0.84193766, + "learning_rate": 0.0008640745834433437, + "loss": 0.85311019, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1368, + "time_per_iteration": 2.7203800678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114698, + "balance_loss_mlp": 1.10065532, + "diversity_loss_mlp": 0.0, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.0718323039568536, + "language_loss": 0.87083656, + "learning_rate": 0.000863860975859738, + "loss": 0.88198352, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1369, + "time_per_iteration": 2.9021553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116648, + "balance_loss_mlp": 1.10278392, + "diversity_loss_mlp": 0.0, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08463505288724613, + "language_loss": 0.88568735, + "learning_rate": 0.0008636472270127733, + "loss": 0.8968538, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1370, + "time_per_iteration": 2.6336748600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_mlp": 1.10440779, + "diversity_loss_mlp": 0.0, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.08505114845208346, + "language_loss": 0.90530956, + "learning_rate": 0.0008634333369854345, + "loss": 0.91649872, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1371, + "time_per_iteration": 2.585775136947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122621, + "balance_loss_mlp": 1.10868549, + "diversity_loss_mlp": 0.0, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.07138701063901956, + "language_loss": 0.87574148, + "learning_rate": 0.0008632193058607608, + "loss": 0.88696772, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.13952637, + "routerloss_mlp": 0.0, + "step": 1372, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124687, + "balance_loss_mlp": 1.11042953, + "diversity_loss_mlp": 0.0, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.09395332240398839, + "language_loss": 0.81125695, + "learning_rate": 0.0008630051337218466, + "loss": 0.82250381, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1373, + "time_per_iteration": 2.6700031757354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118707, + "balance_loss_mlp": 1.10506988, + "diversity_loss_mlp": 0.0, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0808240378873911, + "language_loss": 0.82403839, + "learning_rate": 0.0008627908206518409, + "loss": 0.83522546, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1374, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.04099598647265769, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76212597, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 1375, + "time_per_iteration": 4.979893922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109458, + "balance_loss_mlp": 1.09580863, + "diversity_loss_mlp": 0.0, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06989177478220372, + "language_loss": 0.91488004, + "learning_rate": 0.0008623617720514241, + "loss": 0.92597461, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1376, + "time_per_iteration": 2.6515755653381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_mlp": 1.09554029, + "diversity_loss_mlp": 0.0, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.07399727326907257, + "language_loss": 0.84706682, + "learning_rate": 0.0008621470366875848, + "loss": 0.85816133, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1377, + "time_per_iteration": 2.599776268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119233, + "balance_loss_mlp": 1.10546422, + "diversity_loss_mlp": 0.0, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07769258092785128, + "language_loss": 0.87980253, + "learning_rate": 0.0008619321607257966, + "loss": 0.89099485, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1378, + "time_per_iteration": 2.678865671157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.10274947, + "diversity_loss_mlp": 0.0, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07519514659764338, + "language_loss": 0.82002568, + "learning_rate": 0.000861717144249482, + "loss": 0.83118635, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1379, + "time_per_iteration": 2.8830740451812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_mlp": 1.10515702, + "diversity_loss_mlp": 0.0, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06542821866252439, + "language_loss": 0.89670694, + "learning_rate": 0.0008615019873421175, + "loss": 0.90789306, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.1348877, + "routerloss_mlp": 0.0, + "step": 1380, + "time_per_iteration": 2.4692320823669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124803, + "balance_loss_mlp": 1.11096311, + "diversity_loss_mlp": 0.0, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.08230289019981965, + "language_loss": 0.85984069, + "learning_rate": 0.0008612866900872349, + "loss": 0.87108874, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1381, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119212, + "balance_loss_mlp": 1.10564578, + "diversity_loss_mlp": 0.0, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.09708901974799254, + "language_loss": 0.8800329, + "learning_rate": 0.0008610712525684197, + "loss": 0.89122504, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1382, + "time_per_iteration": 2.673672676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.12075388, + "diversity_loss_mlp": 0.0, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.08550137436350284, + "language_loss": 0.84231853, + "learning_rate": 0.0008608556748693121, + "loss": 0.85366714, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1383, + "time_per_iteration": 3.285391330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113293, + "balance_loss_mlp": 1.11881518, + "diversity_loss_mlp": 0.0, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.07276264363306281, + "language_loss": 0.86098409, + "learning_rate": 0.000860639957073607, + "loss": 0.87231338, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1384, + "time_per_iteration": 2.74979829788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130834, + "balance_loss_mlp": 1.11668396, + "diversity_loss_mlp": 0.0, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07735164598050102, + "language_loss": 0.87488532, + "learning_rate": 0.0008604240992650534, + "loss": 0.88619369, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1385, + "time_per_iteration": 2.765714406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113264, + "balance_loss_mlp": 1.11819148, + "diversity_loss_mlp": 0.0, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.09224305204204497, + "language_loss": 0.89344275, + "learning_rate": 0.0008602081015274545, + "loss": 0.90476912, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1386, + "time_per_iteration": 2.7466471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_mlp": 1.11580229, + "diversity_loss_mlp": 0.0, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.08049268911379595, + "language_loss": 0.83551365, + "learning_rate": 0.0008599919639446684, + "loss": 0.84681749, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1387, + "time_per_iteration": 2.680053234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119435, + "balance_loss_mlp": 1.10439074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08313146027802099, + "language_loss": 0.80363739, + "learning_rate": 0.000859775686600607, + "loss": 0.81483173, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1388, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114186, + "balance_loss_mlp": 1.12722135, + "diversity_loss_mlp": 0.0, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.08559032433145165, + "language_loss": 0.85052109, + "learning_rate": 0.0008595592695792367, + "loss": 0.86193967, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1389, + "time_per_iteration": 2.660012722015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_mlp": 1.11312914, + "diversity_loss_mlp": 0.0, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.07620364037172102, + "language_loss": 0.90774226, + "learning_rate": 0.0008593427129645778, + "loss": 0.91901946, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1390, + "time_per_iteration": 2.62744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131127, + "balance_loss_mlp": 1.11615419, + "diversity_loss_mlp": 0.0, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.0742307152228864, + "language_loss": 0.85619152, + "learning_rate": 0.0008591260168407052, + "loss": 0.86750275, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1391, + "time_per_iteration": 2.738680124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_mlp": 1.09930313, + "diversity_loss_mlp": 0.0, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.05574398067767488, + "language_loss": 0.82839364, + "learning_rate": 0.0008589091812917479, + "loss": 0.83953172, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1392, + "time_per_iteration": 2.5947506427764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_mlp": 1.09471345, + "diversity_loss_mlp": 0.0, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07022348692687568, + "language_loss": 0.85257161, + "learning_rate": 0.0008586922064018887, + "loss": 0.86366403, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1393, + "time_per_iteration": 2.6624581813812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_mlp": 1.09542501, + "diversity_loss_mlp": 0.0, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.07561979453055602, + "language_loss": 0.89401793, + "learning_rate": 0.0008584750922553651, + "loss": 0.9051199, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1394, + "time_per_iteration": 3.1940202713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107917, + "balance_loss_mlp": 1.0934931, + "diversity_loss_mlp": 0.0, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.07234350422575066, + "language_loss": 0.83740592, + "learning_rate": 0.0008582578389364677, + "loss": 0.84848505, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1395, + "time_per_iteration": 2.8844621181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_mlp": 1.09147811, + "diversity_loss_mlp": 0.0, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.061968206774760184, + "language_loss": 0.91908813, + "learning_rate": 0.0008580404465295422, + "loss": 0.93014938, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1396, + "time_per_iteration": 2.7842769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_mlp": 1.09155917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07293181793333794, + "language_loss": 0.88274646, + "learning_rate": 0.0008578229151189876, + "loss": 0.89380777, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1397, + "time_per_iteration": 2.96771502494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_mlp": 1.08638036, + "diversity_loss_mlp": 0.0, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.08798004746081324, + "language_loss": 0.81253606, + "learning_rate": 0.0008576052447892573, + "loss": 0.82354569, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1398, + "time_per_iteration": 2.5413830280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_mlp": 1.08761334, + "diversity_loss_mlp": 0.0, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.0737959226904994, + "language_loss": 0.86320835, + "learning_rate": 0.000857387435624858, + "loss": 0.87422657, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1399, + "time_per_iteration": 2.554016351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00934821, + "balance_loss_mlp": 1.63627267, + "diversity_loss_mlp": 0.20064378, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.02492172823463741, + "language_loss": 0.88190895, + "learning_rate": 0.0008571694877103513, + "loss": 0.89125717, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636335, + "step": 1400, + "time_per_iteration": 3.307114839553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_mlp": 1.09591365, + "diversity_loss_mlp": 0.0, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.07757128819182789, + "language_loss": 0.87680864, + "learning_rate": 0.0008569514011303515, + "loss": 0.88791251, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1401, + "time_per_iteration": 2.800502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917512, + "balance_loss_mlp": 1.60226941, + "diversity_loss_mlp": 0.19939175, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.03393521208879438, + "language_loss": 0.88186574, + "learning_rate": 0.0008567331759695277, + "loss": 0.8910408, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01668182, + "step": 1402, + "time_per_iteration": 2.7670016288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108043, + "balance_loss_mlp": 1.09297514, + "diversity_loss_mlp": 0.0, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.0674494366068644, + "language_loss": 0.86427194, + "learning_rate": 0.0008565148123126023, + "loss": 0.87535238, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1403, + "time_per_iteration": 2.660659074783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_mlp": 1.08053553, + "diversity_loss_mlp": 0.0, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.059221605294443855, + "language_loss": 0.86113608, + "learning_rate": 0.0008562963102443516, + "loss": 0.8720839, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1404, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_mlp": 1.090042, + "diversity_loss_mlp": 0.0, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.08483345099627004, + "language_loss": 0.85166299, + "learning_rate": 0.0008560776698496056, + "loss": 0.86270541, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1405, + "time_per_iteration": 2.9167518615722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110133, + "balance_loss_mlp": 1.09539831, + "diversity_loss_mlp": 0.0, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.06923600464578249, + "language_loss": 0.85861331, + "learning_rate": 0.0008558588912132481, + "loss": 0.86971468, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1406, + "time_per_iteration": 2.8346776962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00696474, + "balance_loss_mlp": 1.17983532, + "diversity_loss_mlp": 0.18206902, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.0036772550136199766, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77155459, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0155216, + "step": 1407, + "time_per_iteration": 4.943782091140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.09137964, + "diversity_loss_mlp": 0.0, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.08329945876184135, + "language_loss": 0.82942384, + "learning_rate": 0.0008554209195555016, + "loss": 0.84047806, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1408, + "time_per_iteration": 2.7417516708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125368, + "balance_loss_mlp": 1.11146832, + "diversity_loss_mlp": 0.0, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.06975199960684045, + "language_loss": 0.8827157, + "learning_rate": 0.0008552017267041483, + "loss": 0.89396936, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1409, + "time_per_iteration": 2.6978721618652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126015, + "balance_loss_mlp": 1.11216331, + "diversity_loss_mlp": 0.0, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06710824628929367, + "language_loss": 0.83395678, + "learning_rate": 0.0008549823959512549, + "loss": 0.84521693, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1410, + "time_per_iteration": 2.6867637634277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_mlp": 1.11246991, + "diversity_loss_mlp": 0.0, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07002470067050659, + "language_loss": 0.86486357, + "learning_rate": 0.0008547629273819728, + "loss": 0.87612069, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.13262939, + "routerloss_mlp": 0.0, + "step": 1411, + "time_per_iteration": 3.410454750061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_mlp": 1.12940812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07619635814943253, + "language_loss": 0.83522588, + "learning_rate": 0.0008545433210815074, + "loss": 0.84665549, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1412, + "time_per_iteration": 2.638172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.12536621, + "diversity_loss_mlp": 0.0, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.06317158203016926, + "language_loss": 0.87351668, + "learning_rate": 0.0008543235771351176, + "loss": 0.88490719, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1413, + "time_per_iteration": 2.7705581188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159735, + "balance_loss_mlp": 1.14645457, + "diversity_loss_mlp": 0.0, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.08259318688939964, + "language_loss": 0.84684592, + "learning_rate": 0.0008541036956281154, + "loss": 0.85844326, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1414, + "time_per_iteration": 2.8803579807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_mlp": 1.13435841, + "diversity_loss_mlp": 0.0, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09396951476817994, + "language_loss": 0.81928164, + "learning_rate": 0.0008538836766458665, + "loss": 0.83076018, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.13519287, + "routerloss_mlp": 0.0, + "step": 1415, + "time_per_iteration": 2.860991954803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140979, + "balance_loss_mlp": 1.12721062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.07553622395064079, + "language_loss": 0.84927893, + "learning_rate": 0.0008536635202737897, + "loss": 0.86068869, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1416, + "time_per_iteration": 2.848196268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146453, + "balance_loss_mlp": 1.13278019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.07031625369418516, + "language_loss": 0.82188255, + "learning_rate": 0.0008534432265973573, + "loss": 0.83334708, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1417, + "time_per_iteration": 2.6029789447784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153419, + "balance_loss_mlp": 1.13950717, + "diversity_loss_mlp": 0.0, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07823597875801033, + "language_loss": 0.88322413, + "learning_rate": 0.000853222795702095, + "loss": 0.89475828, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1418, + "time_per_iteration": 3.3933968544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.13570726, + "diversity_loss_mlp": 0.0, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.07267637680100167, + "language_loss": 0.83730674, + "learning_rate": 0.0008530022276735813, + "loss": 0.84880364, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1419, + "time_per_iteration": 2.766181707382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_mlp": 1.12086129, + "diversity_loss_mlp": 0.0, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.06887995103877555, + "language_loss": 0.86238861, + "learning_rate": 0.0008527815225974489, + "loss": 0.87373358, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1420, + "time_per_iteration": 2.6471102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135972, + "balance_loss_mlp": 1.12148833, + "diversity_loss_mlp": 0.0, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10131461494963417, + "language_loss": 0.88726115, + "learning_rate": 0.0008525606805593829, + "loss": 0.89862096, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1421, + "time_per_iteration": 2.436647653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_mlp": 1.10405266, + "diversity_loss_mlp": 0.0, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0859881194807961, + "language_loss": 0.8254106, + "learning_rate": 0.0008523397016451213, + "loss": 0.83659345, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1422, + "time_per_iteration": 2.593588352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_mlp": 1.08907628, + "diversity_loss_mlp": 0.0, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.06052148467578676, + "language_loss": 0.87038374, + "learning_rate": 0.0008521185859404564, + "loss": 0.88142037, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1423, + "time_per_iteration": 3.3936307430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092129, + "balance_loss_mlp": 1.07775199, + "diversity_loss_mlp": 0.0, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06977326166261295, + "language_loss": 0.8940134, + "learning_rate": 0.0008518973335312326, + "loss": 0.90493476, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1424, + "time_per_iteration": 2.7834270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081272, + "balance_loss_mlp": 1.06702638, + "diversity_loss_mlp": 0.0, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.119675165593639, + "language_loss": 0.83282709, + "learning_rate": 0.0008516759445033477, + "loss": 0.84363985, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1425, + "time_per_iteration": 2.665099859237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.06930685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08266887436661914, + "language_loss": 0.85026807, + "learning_rate": 0.0008514544189427526, + "loss": 0.86110568, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1426, + "time_per_iteration": 2.6887404918670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_mlp": 1.07249546, + "diversity_loss_mlp": 0.0, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.06908859165293682, + "language_loss": 0.86575979, + "learning_rate": 0.0008512327569354511, + "loss": 0.87662017, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1427, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_mlp": 1.09480238, + "diversity_loss_mlp": 0.0, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08987008099145026, + "language_loss": 0.8368206, + "learning_rate": 0.0008510109585675001, + "loss": 0.847902, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.13360596, + "routerloss_mlp": 0.0, + "step": 1428, + "time_per_iteration": 2.613348960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140613, + "balance_loss_mlp": 1.13260245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.05207498704371428, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82293957, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1429, + "time_per_iteration": 4.706013202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133032, + "balance_loss_mlp": 1.11977601, + "diversity_loss_mlp": 0.0, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.09002666847623074, + "language_loss": 0.80503839, + "learning_rate": 0.0008505669530941415, + "loss": 0.8163687, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1430, + "time_per_iteration": 3.2976372241973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0097004, + "balance_loss_mlp": 1.70641518, + "diversity_loss_mlp": 0.20088202, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.03747760406507578, + "language_loss": 0.84294951, + "learning_rate": 0.000850344746161112, + "loss": 0.85264993, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01639144, + "step": 1431, + "time_per_iteration": 2.6297106742858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_mlp": 1.12685704, + "diversity_loss_mlp": 0.0, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.08230554095697513, + "language_loss": 0.87346137, + "learning_rate": 0.0008501224032121894, + "loss": 0.88486063, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1432, + "time_per_iteration": 2.4853787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129049, + "balance_loss_mlp": 1.1158998, + "diversity_loss_mlp": 0.0, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06557126517551867, + "language_loss": 0.82118285, + "learning_rate": 0.0008498999243336946, + "loss": 0.83247334, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1433, + "time_per_iteration": 2.623809576034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11776567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.0832335684907068, + "language_loss": 0.87471139, + "learning_rate": 0.0008496773096120021, + "loss": 0.88601708, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.12817383, + "routerloss_mlp": 0.0, + "step": 1434, + "time_per_iteration": 2.7995760440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_mlp": 1.10637057, + "diversity_loss_mlp": 0.0, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.10286197296711953, + "language_loss": 0.84387434, + "learning_rate": 0.0008494545591335381, + "loss": 0.85507143, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.13354492, + "routerloss_mlp": 0.0, + "step": 1435, + "time_per_iteration": 2.933576822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_mlp": 1.09978795, + "diversity_loss_mlp": 0.0, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.053150449500146836, + "language_loss": 0.86971611, + "learning_rate": 0.0008492316729847823, + "loss": 0.88084674, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1436, + "time_per_iteration": 2.8865604400634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.09676659, + "diversity_loss_mlp": 0.0, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08937825724590943, + "language_loss": 0.7968539, + "learning_rate": 0.0008490086512522664, + "loss": 0.80795395, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1437, + "time_per_iteration": 2.7166872024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105369, + "balance_loss_mlp": 1.0916723, + "diversity_loss_mlp": 0.0, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.09013751301914075, + "language_loss": 0.90582836, + "learning_rate": 0.0008487854940225755, + "loss": 0.91688204, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1438, + "time_per_iteration": 2.4426465034484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_mlp": 1.08844161, + "diversity_loss_mlp": 0.0, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.09066429268698341, + "language_loss": 0.89896768, + "learning_rate": 0.0008485622013823466, + "loss": 0.90999383, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1439, + "time_per_iteration": 2.599177360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07675576, + "diversity_loss_mlp": 0.0, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08059762035463526, + "language_loss": 0.83446515, + "learning_rate": 0.00084833877341827, + "loss": 0.84537244, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1440, + "time_per_iteration": 2.667215347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.0762167, + "diversity_loss_mlp": 0.0, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.07889497077341047, + "language_loss": 0.80625433, + "learning_rate": 0.000848115210217088, + "loss": 0.81715715, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1441, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.08003855, + "diversity_loss_mlp": 0.0, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08443965058939805, + "language_loss": 0.81771946, + "learning_rate": 0.0008478915118655952, + "loss": 0.82866359, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1442, + "time_per_iteration": 2.743678569793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_mlp": 1.10385561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07019455815968899, + "language_loss": 0.86195552, + "learning_rate": 0.0008476676784506393, + "loss": 0.87313789, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1443, + "time_per_iteration": 2.663422107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.10996866, + "diversity_loss_mlp": 0.0, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.08623331537045495, + "language_loss": 0.81889486, + "learning_rate": 0.0008474437100591201, + "loss": 0.83014178, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1444, + "time_per_iteration": 3.340557813644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_mlp": 1.11489129, + "diversity_loss_mlp": 0.0, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08279806566523454, + "language_loss": 0.85577607, + "learning_rate": 0.0008472196067779898, + "loss": 0.86707067, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1445, + "time_per_iteration": 2.675623655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112665, + "balance_loss_mlp": 1.09800267, + "diversity_loss_mlp": 0.0, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10281028137483857, + "language_loss": 0.85108185, + "learning_rate": 0.0008469953686942531, + "loss": 0.86220849, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1446, + "time_per_iteration": 3.0647382736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933894, + "balance_loss_mlp": 1.63962197, + "diversity_loss_mlp": 0.19544066, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.039122045531048345, + "language_loss": 0.83261281, + "learning_rate": 0.0008467709958949668, + "loss": 0.84195173, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636306, + "step": 1447, + "time_per_iteration": 2.777806043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932176, + "balance_loss_mlp": 1.63710666, + "diversity_loss_mlp": 0.19454433, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.036668832644649825, + "language_loss": 0.85678959, + "learning_rate": 0.0008465464884672403, + "loss": 0.8661114, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01635053, + "step": 1448, + "time_per_iteration": 2.7313778400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109364, + "balance_loss_mlp": 1.07944214, + "diversity_loss_mlp": 0.0, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.08672786191572247, + "language_loss": 0.85892808, + "learning_rate": 0.0008463218464982348, + "loss": 0.86986446, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1449, + "time_per_iteration": 2.8115885257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109775, + "balance_loss_mlp": 1.08367157, + "diversity_loss_mlp": 0.0, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.09681901325388456, + "language_loss": 0.8756566, + "learning_rate": 0.0008460970700751645, + "loss": 0.88663405, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1450, + "time_per_iteration": 3.071645975112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.07963276, + "diversity_loss_mlp": 0.0, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.09020366192691211, + "language_loss": 0.87640095, + "learning_rate": 0.000845872159285295, + "loss": 0.88733411, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1451, + "time_per_iteration": 2.7342164516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_mlp": 1.04301238, + "diversity_loss_mlp": 0.0, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.032344288076380935, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78818536, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1452, + "time_per_iteration": 4.95387077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_mlp": 1.10795009, + "diversity_loss_mlp": 0.0, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.08097200979220782, + "language_loss": 0.86171871, + "learning_rate": 0.0008454219349544836, + "loss": 0.87293363, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1453, + "time_per_iteration": 3.373755693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127619, + "balance_loss_mlp": 1.11439896, + "diversity_loss_mlp": 0.0, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.0882994281711823, + "language_loss": 0.81864405, + "learning_rate": 0.000845196621588334, + "loss": 0.82992017, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.13244629, + "routerloss_mlp": 0.0, + "step": 1454, + "time_per_iteration": 2.758122682571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147815, + "balance_loss_mlp": 1.13453507, + "diversity_loss_mlp": 0.0, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.06575509380885615, + "language_loss": 0.76256007, + "learning_rate": 0.0008449711742049706, + "loss": 0.7740382, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1455, + "time_per_iteration": 2.752345561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.1432693, + "diversity_loss_mlp": 0.0, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.10411587441286801, + "language_loss": 0.84306383, + "learning_rate": 0.0008447455928919196, + "loss": 0.85462898, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1456, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.13327312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.07273170046833245, + "language_loss": 0.86767292, + "learning_rate": 0.0008445198777367595, + "loss": 0.87913817, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1457, + "time_per_iteration": 2.614743947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144047, + "balance_loss_mlp": 1.13080251, + "diversity_loss_mlp": 0.0, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.08362811388708001, + "language_loss": 0.81054902, + "learning_rate": 0.0008442940288271208, + "loss": 0.82198954, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1458, + "time_per_iteration": 2.615705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112578, + "balance_loss_mlp": 1.11191583, + "diversity_loss_mlp": 0.0, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06892977395484212, + "language_loss": 0.8688817, + "learning_rate": 0.0008440680462506856, + "loss": 0.88013953, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1459, + "time_per_iteration": 2.810474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_mlp": 1.10828125, + "diversity_loss_mlp": 0.0, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.06441288224223744, + "language_loss": 0.86424565, + "learning_rate": 0.0008438419300951883, + "loss": 0.87545788, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1460, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_mlp": 1.10215354, + "diversity_loss_mlp": 0.0, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.12446768600100189, + "language_loss": 0.86647975, + "learning_rate": 0.0008436156804484148, + "loss": 0.87763494, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1461, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110833, + "balance_loss_mlp": 1.0965395, + "diversity_loss_mlp": 0.0, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.08490544085138897, + "language_loss": 0.88168794, + "learning_rate": 0.0008433892973982031, + "loss": 0.89279622, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1462, + "time_per_iteration": 2.561211347579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10098886, + "diversity_loss_mlp": 0.0, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07295818188475026, + "language_loss": 0.84776855, + "learning_rate": 0.0008431627810324431, + "loss": 0.85892212, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1463, + "time_per_iteration": 2.654146671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_mlp": 1.10345769, + "diversity_loss_mlp": 0.0, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06893619297503142, + "language_loss": 0.8126353, + "learning_rate": 0.000842936131439076, + "loss": 0.82381272, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1464, + "time_per_iteration": 2.6571760177612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_mlp": 1.1010766, + "diversity_loss_mlp": 0.0, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.07879840484237804, + "language_loss": 0.87885797, + "learning_rate": 0.0008427093487060951, + "loss": 0.89001191, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1465, + "time_per_iteration": 2.6847336292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101907, + "balance_loss_mlp": 1.08776927, + "diversity_loss_mlp": 0.0, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06118480673876746, + "language_loss": 0.84661305, + "learning_rate": 0.000842482432921545, + "loss": 0.8576321, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1466, + "time_per_iteration": 2.884965181350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_mlp": 1.09353852, + "diversity_loss_mlp": 0.0, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07927655906335743, + "language_loss": 0.87199128, + "learning_rate": 0.0008422553841735225, + "loss": 0.88306642, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1467, + "time_per_iteration": 2.528017997741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115631, + "balance_loss_mlp": 1.10146928, + "diversity_loss_mlp": 0.0, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07348722340160863, + "language_loss": 0.84837711, + "learning_rate": 0.0008420282025501757, + "loss": 0.85953343, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1468, + "time_per_iteration": 2.7696359157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_mlp": 1.10156429, + "diversity_loss_mlp": 0.0, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07024793700711117, + "language_loss": 0.85080296, + "learning_rate": 0.0008418008881397043, + "loss": 0.86195612, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1469, + "time_per_iteration": 2.659646511077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115825, + "balance_loss_mlp": 1.10241413, + "diversity_loss_mlp": 0.0, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.12791916727658353, + "language_loss": 0.82420468, + "learning_rate": 0.0008415734410303595, + "loss": 0.83536291, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1470, + "time_per_iteration": 3.2350287437438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_mlp": 1.10672879, + "diversity_loss_mlp": 0.0, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.0700140113394834, + "language_loss": 0.90437436, + "learning_rate": 0.0008413458613104444, + "loss": 0.91557699, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1471, + "time_per_iteration": 2.7219245433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111254, + "balance_loss_mlp": 1.09766376, + "diversity_loss_mlp": 0.0, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.07145574186167022, + "language_loss": 0.83164495, + "learning_rate": 0.0008411181490683129, + "loss": 0.84275752, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1472, + "time_per_iteration": 2.727936029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_mlp": 1.09348917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.0645149730480124, + "language_loss": 0.82377428, + "learning_rate": 0.0008408903043923707, + "loss": 0.83485162, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1473, + "time_per_iteration": 2.9972269535064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_mlp": 1.1004951, + "diversity_loss_mlp": 0.0, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09233547648167305, + "language_loss": 0.81268132, + "learning_rate": 0.0008406623273710754, + "loss": 0.82382679, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1474, + "time_per_iteration": 2.5923123359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_mlp": 1.09263408, + "diversity_loss_mlp": 0.0, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0761903935255829, + "language_loss": 0.8290056, + "learning_rate": 0.0008404342180929351, + "loss": 0.840065, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1475, + "time_per_iteration": 2.664698600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121728, + "balance_loss_mlp": 1.10819817, + "diversity_loss_mlp": 0.0, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.08946081876366527, + "language_loss": 0.81824017, + "learning_rate": 0.00084020597664651, + "loss": 0.82945752, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1476, + "time_per_iteration": 2.7941510677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113829, + "balance_loss_mlp": 1.10019112, + "diversity_loss_mlp": 0.0, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09030679544521746, + "language_loss": 0.83820337, + "learning_rate": 0.0008399776031204111, + "loss": 0.84934169, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1477, + "time_per_iteration": 2.7508158683776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_mlp": 1.08784389, + "diversity_loss_mlp": 0.0, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07642048536310797, + "language_loss": 0.79864645, + "learning_rate": 0.0008397490976033009, + "loss": 0.80966175, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1478, + "time_per_iteration": 2.6500625610351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_mlp": 1.04673624, + "diversity_loss_mlp": 0.0, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.0303646120618472, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933775, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.07373047, + "routerloss_mlp": 0.0, + "step": 1479, + "time_per_iteration": 4.757360935211182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_mlp": 1.08449173, + "diversity_loss_mlp": 0.0, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06570619267025138, + "language_loss": 0.85133117, + "learning_rate": 0.0008392916909509525, + "loss": 0.86231726, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1480, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093081, + "balance_loss_mlp": 1.07888281, + "diversity_loss_mlp": 0.0, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07896332999012158, + "language_loss": 0.8543641, + "learning_rate": 0.0008390627899932954, + "loss": 0.86529493, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1481, + "time_per_iteration": 2.5937705039978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_mlp": 1.08532953, + "diversity_loss_mlp": 0.0, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.08879627929694006, + "language_loss": 0.88894033, + "learning_rate": 0.000838833757399789, + "loss": 0.89994287, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1482, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_mlp": 1.09247661, + "diversity_loss_mlp": 0.0, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08557616325511565, + "language_loss": 0.80760586, + "learning_rate": 0.0008386045932593515, + "loss": 0.81867552, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1483, + "time_per_iteration": 2.6901025772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.09776473, + "diversity_loss_mlp": 0.0, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.0661413109298982, + "language_loss": 0.86017227, + "learning_rate": 0.0008383752976609525, + "loss": 0.87129307, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1484, + "time_per_iteration": 2.9148330688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_mlp": 1.1014719, + "diversity_loss_mlp": 0.0, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06788684976720215, + "language_loss": 0.80004096, + "learning_rate": 0.0008381458706936123, + "loss": 0.81120521, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1485, + "time_per_iteration": 2.681067943572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112387, + "balance_loss_mlp": 1.09728312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06920905175587555, + "language_loss": 0.8725493, + "learning_rate": 0.0008379163124464025, + "loss": 0.88367319, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1486, + "time_per_iteration": 2.7093162536621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.10290396, + "diversity_loss_mlp": 0.0, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.09647963836289664, + "language_loss": 0.77093983, + "learning_rate": 0.0008376866230084452, + "loss": 0.78211844, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1487, + "time_per_iteration": 2.8678433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00910546, + "balance_loss_mlp": 1.59136748, + "diversity_loss_mlp": 0.19592074, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.03660624024989628, + "language_loss": 0.86046171, + "learning_rate": 0.000837456802468914, + "loss": 0.86956716, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01690142, + "step": 1488, + "time_per_iteration": 2.602982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_mlp": 1.08787107, + "diversity_loss_mlp": 0.0, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.0820682475712047, + "language_loss": 0.85374725, + "learning_rate": 0.0008372268509170331, + "loss": 0.86477119, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1489, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099554, + "balance_loss_mlp": 1.08529639, + "diversity_loss_mlp": 0.0, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09305985964981825, + "language_loss": 0.85262501, + "learning_rate": 0.0008369967684420779, + "loss": 0.86362052, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1490, + "time_per_iteration": 2.7102949619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.06912422, + "diversity_loss_mlp": 0.0, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.08804420397834639, + "language_loss": 0.84696782, + "learning_rate": 0.0008367665551333736, + "loss": 0.85779965, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1491, + "time_per_iteration": 2.618272304534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_mlp": 1.07430756, + "diversity_loss_mlp": 0.0, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.07991380194683065, + "language_loss": 0.85525382, + "learning_rate": 0.0008365362110802977, + "loss": 0.86614019, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1492, + "time_per_iteration": 2.851928234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_mlp": 1.08655906, + "diversity_loss_mlp": 0.0, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.0838988471662801, + "language_loss": 0.82620168, + "learning_rate": 0.0008363057363722773, + "loss": 0.83721185, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1493, + "time_per_iteration": 2.853207588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.09245062, + "diversity_loss_mlp": 0.0, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.06826703692619526, + "language_loss": 0.84157109, + "learning_rate": 0.0008360751310987906, + "loss": 0.85263485, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1494, + "time_per_iteration": 2.57387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11695361, + "diversity_loss_mlp": 0.0, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.058749130100992836, + "language_loss": 0.85290074, + "learning_rate": 0.0008358443953493666, + "loss": 0.86420786, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1495, + "time_per_iteration": 2.8883073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164777, + "balance_loss_mlp": 1.15067482, + "diversity_loss_mlp": 0.0, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.08087911977453179, + "language_loss": 0.88221979, + "learning_rate": 0.0008356135292135851, + "loss": 0.89386749, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1496, + "time_per_iteration": 2.5230934619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186431, + "balance_loss_mlp": 1.17226899, + "diversity_loss_mlp": 0.0, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.11116302526442519, + "language_loss": 0.92429602, + "learning_rate": 0.0008353825327810758, + "loss": 0.93616039, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1497, + "time_per_iteration": 2.420966863632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188369, + "balance_loss_mlp": 1.17465985, + "diversity_loss_mlp": 0.0, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.07094257684914687, + "language_loss": 0.8160103, + "learning_rate": 0.00083515140614152, + "loss": 0.82789397, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1498, + "time_per_iteration": 2.7105205059051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172297, + "balance_loss_mlp": 1.15901685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.09212284213685974, + "language_loss": 0.87059236, + "learning_rate": 0.0008349201493846485, + "loss": 0.88231528, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1499, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.13470435, + "diversity_loss_mlp": 0.0, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07375807574735407, + "language_loss": 0.88790113, + "learning_rate": 0.0008346887626002432, + "loss": 0.89938325, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1500, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919256, + "balance_loss_mlp": 1.60489607, + "diversity_loss_mlp": 0.19980004, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.030907333217789122, + "language_loss": 0.85892522, + "learning_rate": 0.000834457245878137, + "loss": 0.86811781, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0169074, + "step": 1501, + "time_per_iteration": 2.6543540954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112198, + "balance_loss_mlp": 1.10861671, + "diversity_loss_mlp": 0.0, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.09029230185558035, + "language_loss": 0.81450766, + "learning_rate": 0.000834225599308212, + "loss": 0.82572746, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1502, + "time_per_iteration": 3.2493886947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_mlp": 1.11191428, + "diversity_loss_mlp": 0.0, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07343077704271528, + "language_loss": 0.85592055, + "learning_rate": 0.0008339938229804016, + "loss": 0.86717403, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.13458252, + "routerloss_mlp": 0.0, + "step": 1503, + "time_per_iteration": 2.712455987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.08344853, + "diversity_loss_mlp": 0.0, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.040592353184382625, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76525998, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1504, + "time_per_iteration": 4.975377082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_mlp": 1.10320854, + "diversity_loss_mlp": 0.0, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.10665663300821891, + "language_loss": 0.84014988, + "learning_rate": 0.0008335298814111094, + "loss": 0.85132295, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1505, + "time_per_iteration": 2.563352584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119478, + "balance_loss_mlp": 1.10572124, + "diversity_loss_mlp": 0.0, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.07488877863745698, + "language_loss": 0.87982982, + "learning_rate": 0.0008332977163497455, + "loss": 0.89102459, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1506, + "time_per_iteration": 2.799177646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.10419846, + "diversity_loss_mlp": 0.0, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.08855239932012744, + "language_loss": 0.83522987, + "learning_rate": 0.0008330654218907325, + "loss": 0.84640789, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1507, + "time_per_iteration": 2.7311654090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130891, + "balance_loss_mlp": 1.1170032, + "diversity_loss_mlp": 0.0, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06185767339129184, + "language_loss": 0.82011658, + "learning_rate": 0.0008328329981242548, + "loss": 0.83142549, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1508, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148949, + "balance_loss_mlp": 1.13483465, + "diversity_loss_mlp": 0.0, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.0780337340178098, + "language_loss": 0.88045996, + "learning_rate": 0.0008326004451405475, + "loss": 0.89194947, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1509, + "time_per_iteration": 2.7449288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146827, + "balance_loss_mlp": 1.13290334, + "diversity_loss_mlp": 0.0, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07615169765943663, + "language_loss": 0.82328165, + "learning_rate": 0.0008323677630298957, + "loss": 0.83474988, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1510, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911058, + "balance_loss_mlp": 1.59209251, + "diversity_loss_mlp": 0.19929613, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.030084219280472915, + "language_loss": 0.84789264, + "learning_rate": 0.0008321349518826345, + "loss": 0.85700321, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01536426, + "step": 1511, + "time_per_iteration": 2.85006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167449, + "balance_loss_mlp": 1.15337038, + "diversity_loss_mlp": 0.0, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.09547204503407083, + "language_loss": 0.94614309, + "learning_rate": 0.0008319020117891491, + "loss": 0.95781755, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1512, + "time_per_iteration": 2.619699001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150869, + "balance_loss_mlp": 1.13603973, + "diversity_loss_mlp": 0.0, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0903449194731753, + "language_loss": 0.86757064, + "learning_rate": 0.0008316689428398751, + "loss": 0.87907934, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1513, + "time_per_iteration": 2.6975061893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_mlp": 1.10804975, + "diversity_loss_mlp": 0.0, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05700485295001885, + "language_loss": 0.88661957, + "learning_rate": 0.0008314357451252979, + "loss": 0.89784312, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1514, + "time_per_iteration": 2.7759623527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_mlp": 1.08762062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.06876651723291546, + "language_loss": 0.87979865, + "learning_rate": 0.0008312024187359527, + "loss": 0.89081734, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1515, + "time_per_iteration": 2.6594746112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108928, + "balance_loss_mlp": 1.07499838, + "diversity_loss_mlp": 0.0, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.06943657009436902, + "language_loss": 0.87168229, + "learning_rate": 0.000830968963762425, + "loss": 0.88257504, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1516, + "time_per_iteration": 3.0544168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078645, + "balance_loss_mlp": 1.06457818, + "diversity_loss_mlp": 0.0, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.07942748937188983, + "language_loss": 0.84183443, + "learning_rate": 0.0008307353802953497, + "loss": 0.85262084, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1517, + "time_per_iteration": 2.7325901985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.06031072, + "diversity_loss_mlp": 0.0, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.0903207444065502, + "language_loss": 0.86203992, + "learning_rate": 0.0008305016684254125, + "loss": 0.87279052, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1518, + "time_per_iteration": 2.790580987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_mlp": 1.05908012, + "diversity_loss_mlp": 0.0, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07640210633127195, + "language_loss": 0.86818451, + "learning_rate": 0.0008302678282433479, + "loss": 0.87892002, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1519, + "time_per_iteration": 2.594045400619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077986, + "balance_loss_mlp": 1.06394291, + "diversity_loss_mlp": 0.0, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07607218771192015, + "language_loss": 0.84937745, + "learning_rate": 0.0008300338598399411, + "loss": 0.86015737, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1520, + "time_per_iteration": 2.6176183223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897129, + "balance_loss_mlp": 1.56367016, + "diversity_loss_mlp": 0.19839743, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.03454500929264816, + "language_loss": 0.94754219, + "learning_rate": 0.0008297997633060263, + "loss": 0.95651346, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160955, + "step": 1521, + "time_per_iteration": 2.5507402420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_mlp": 1.08445215, + "diversity_loss_mlp": 0.0, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07923859397995789, + "language_loss": 0.84868819, + "learning_rate": 0.0008295655387324883, + "loss": 0.8596729, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1522, + "time_per_iteration": 2.942894458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_mlp": 1.08957708, + "diversity_loss_mlp": 0.0, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.09185291067452052, + "language_loss": 0.84979212, + "learning_rate": 0.0008293311862102609, + "loss": 0.86082506, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1523, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10218382, + "diversity_loss_mlp": 0.0, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07878242279946136, + "language_loss": 0.88546365, + "learning_rate": 0.0008290967058303275, + "loss": 0.89662319, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1524, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_mlp": 1.10387325, + "diversity_loss_mlp": 0.0, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07157234250277994, + "language_loss": 0.86573815, + "learning_rate": 0.0008288620976837219, + "loss": 0.87690842, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1525, + "time_per_iteration": 2.539079427719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116802, + "balance_loss_mlp": 1.10354626, + "diversity_loss_mlp": 0.0, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07300174969402286, + "language_loss": 0.82548958, + "learning_rate": 0.000828627361861527, + "loss": 0.83665758, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1526, + "time_per_iteration": 2.5784413814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_mlp": 1.10368335, + "diversity_loss_mlp": 0.0, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.105387273671708, + "language_loss": 0.84438479, + "learning_rate": 0.0008283924984548752, + "loss": 0.85555708, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1527, + "time_per_iteration": 2.876854181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136626, + "balance_loss_mlp": 1.12352467, + "diversity_loss_mlp": 0.0, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.07473419184062492, + "language_loss": 0.84776825, + "learning_rate": 0.0008281575075549485, + "loss": 0.8591345, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1528, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_mlp": 1.09631968, + "diversity_loss_mlp": 0.0, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.053938657910520806, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78456688, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1529, + "time_per_iteration": 4.633493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149647, + "balance_loss_mlp": 1.13666511, + "diversity_loss_mlp": 0.0, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.07225715112962865, + "language_loss": 0.90511358, + "learning_rate": 0.0008276871436402469, + "loss": 0.91661, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1530, + "time_per_iteration": 2.8149213790893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156897, + "balance_loss_mlp": 1.14402199, + "diversity_loss_mlp": 0.0, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.10076437192912456, + "language_loss": 0.87526608, + "learning_rate": 0.000827451770808083, + "loss": 0.88683504, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1531, + "time_per_iteration": 2.7307019233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_mlp": 1.12402749, + "diversity_loss_mlp": 0.0, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07118672956881426, + "language_loss": 0.8318634, + "learning_rate": 0.0008272162708478674, + "loss": 0.84323561, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1532, + "time_per_iteration": 2.559326648712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_mlp": 1.1222167, + "diversity_loss_mlp": 0.0, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.07324079883183283, + "language_loss": 0.86170006, + "learning_rate": 0.000826980643851029, + "loss": 0.87305093, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1533, + "time_per_iteration": 2.728351354598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120692, + "balance_loss_mlp": 1.10734081, + "diversity_loss_mlp": 0.0, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.07850912920042735, + "language_loss": 0.84523225, + "learning_rate": 0.0008267448899090464, + "loss": 0.85643911, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1534, + "time_per_iteration": 2.595296859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121931, + "balance_loss_mlp": 1.10788798, + "diversity_loss_mlp": 0.0, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07265790711823701, + "language_loss": 0.80930066, + "learning_rate": 0.0008265090091134473, + "loss": 0.82051992, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1535, + "time_per_iteration": 2.8336315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105358, + "balance_loss_mlp": 1.09133863, + "diversity_loss_mlp": 0.0, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.08467148330579209, + "language_loss": 0.80271345, + "learning_rate": 0.0008262730015558088, + "loss": 0.81376696, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1536, + "time_per_iteration": 2.9066760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_mlp": 1.08847594, + "diversity_loss_mlp": 0.0, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.07407642769484, + "language_loss": 0.81805962, + "learning_rate": 0.0008260368673277574, + "loss": 0.82908159, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1537, + "time_per_iteration": 3.1795482635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_mlp": 1.09302735, + "diversity_loss_mlp": 0.0, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06784415515848828, + "language_loss": 0.84026253, + "learning_rate": 0.0008258006065209682, + "loss": 0.85132986, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1538, + "time_per_iteration": 2.766732931137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112658, + "balance_loss_mlp": 1.09863889, + "diversity_loss_mlp": 0.0, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.0747520981493109, + "language_loss": 0.80543184, + "learning_rate": 0.0008255642192271657, + "loss": 0.81655836, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1539, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130833, + "balance_loss_mlp": 1.11683834, + "diversity_loss_mlp": 0.0, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06277821647748005, + "language_loss": 0.83592129, + "learning_rate": 0.0008253277055381241, + "loss": 0.8472296, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1540, + "time_per_iteration": 2.8384311199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138407, + "balance_loss_mlp": 1.12428069, + "diversity_loss_mlp": 0.0, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09924754491110549, + "language_loss": 0.85482454, + "learning_rate": 0.0008250910655456658, + "loss": 0.86620867, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1541, + "time_per_iteration": 3.1718008518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.12016189, + "diversity_loss_mlp": 0.0, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.07747440640117766, + "language_loss": 0.83370835, + "learning_rate": 0.0008248542993416625, + "loss": 0.84504688, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1542, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127147, + "balance_loss_mlp": 1.11278272, + "diversity_loss_mlp": 0.0, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08018137719350796, + "language_loss": 0.83926904, + "learning_rate": 0.0008246174070180352, + "loss": 0.85054052, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1543, + "time_per_iteration": 2.6775217056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.10168624, + "diversity_loss_mlp": 0.0, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09273281815149376, + "language_loss": 0.83928716, + "learning_rate": 0.0008243803886667537, + "loss": 0.85044312, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1544, + "time_per_iteration": 3.0925238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_mlp": 1.09024858, + "diversity_loss_mlp": 0.0, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.06593992881851045, + "language_loss": 0.79115343, + "learning_rate": 0.0008241432443798364, + "loss": 0.80219567, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1545, + "time_per_iteration": 2.839099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_mlp": 1.07518196, + "diversity_loss_mlp": 0.0, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05453506209022983, + "language_loss": 0.85691601, + "learning_rate": 0.0008239059742493512, + "loss": 0.86780155, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1546, + "time_per_iteration": 2.7476751804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088565, + "balance_loss_mlp": 1.07480812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.06672989003234615, + "language_loss": 0.87117672, + "learning_rate": 0.0008236685783674142, + "loss": 0.88206244, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1547, + "time_per_iteration": 3.0519776344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06796312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.04305360715769565, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.772995, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1548, + "time_per_iteration": 4.883166790008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.07123256, + "diversity_loss_mlp": 0.0, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.11160876507978217, + "language_loss": 0.82253683, + "learning_rate": 0.0008231934097178955, + "loss": 0.8333841, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.1350708, + "routerloss_mlp": 0.0, + "step": 1549, + "time_per_iteration": 2.60786771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.07919788, + "diversity_loss_mlp": 0.0, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07843428838445873, + "language_loss": 0.85328496, + "learning_rate": 0.0008229556371347903, + "loss": 0.86420953, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1550, + "time_per_iteration": 2.962412118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106892, + "balance_loss_mlp": 1.09379029, + "diversity_loss_mlp": 0.0, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.0840525031564576, + "language_loss": 0.79399186, + "learning_rate": 0.0008227177391691874, + "loss": 0.80506086, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.13122559, + "routerloss_mlp": 0.0, + "step": 1551, + "time_per_iteration": 3.1673550605773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111848, + "balance_loss_mlp": 1.09871709, + "diversity_loss_mlp": 0.0, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07195743014481873, + "language_loss": 0.89281148, + "learning_rate": 0.0008224797159134463, + "loss": 0.90392995, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1552, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121508, + "balance_loss_mlp": 1.10890126, + "diversity_loss_mlp": 0.0, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07485820549569244, + "language_loss": 0.83144093, + "learning_rate": 0.0008222415674599765, + "loss": 0.84265602, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 1553, + "time_per_iteration": 3.077017068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135128, + "balance_loss_mlp": 1.12165701, + "diversity_loss_mlp": 0.0, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.08671551895934956, + "language_loss": 0.83149582, + "learning_rate": 0.0008220032939012349, + "loss": 0.84284711, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1554, + "time_per_iteration": 2.6689035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115966, + "balance_loss_mlp": 1.10284674, + "diversity_loss_mlp": 0.0, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06666483036401037, + "language_loss": 0.87800217, + "learning_rate": 0.0008217648953297277, + "loss": 0.88916183, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1555, + "time_per_iteration": 2.8417294025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_mlp": 1.10677278, + "diversity_loss_mlp": 0.0, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.08472740856632217, + "language_loss": 0.78017807, + "learning_rate": 0.0008215263718380095, + "loss": 0.7913779, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1556, + "time_per_iteration": 2.682047128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096383, + "balance_loss_mlp": 1.08319807, + "diversity_loss_mlp": 0.0, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07743195715790333, + "language_loss": 0.84389544, + "learning_rate": 0.0008212877235186833, + "loss": 0.85485923, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.13201904, + "routerloss_mlp": 0.0, + "step": 1557, + "time_per_iteration": 2.6532580852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_mlp": 1.06710196, + "diversity_loss_mlp": 0.0, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04061005434024277, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78811955, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 1558, + "time_per_iteration": 4.923272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092088, + "balance_loss_mlp": 1.07896352, + "diversity_loss_mlp": 0.0, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.10565427097675566, + "language_loss": 0.8116585, + "learning_rate": 0.0008208100527678611, + "loss": 0.82257938, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1559, + "time_per_iteration": 2.602773427963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.07101393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11780548804152448, + "language_loss": 0.78494406, + "learning_rate": 0.0008205710305218135, + "loss": 0.79578459, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1560, + "time_per_iteration": 3.013576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089526, + "balance_loss_mlp": 1.07663918, + "diversity_loss_mlp": 0.0, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.08018423106971302, + "language_loss": 0.89838511, + "learning_rate": 0.0008203318838190541, + "loss": 0.9092803, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1561, + "time_per_iteration": 2.741619348526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108989, + "balance_loss_mlp": 1.07702184, + "diversity_loss_mlp": 0.0, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09397123990600864, + "language_loss": 0.85396177, + "learning_rate": 0.0008200926127524281, + "loss": 0.86486065, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1562, + "time_per_iteration": 2.60974383354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_mlp": 1.0936904, + "diversity_loss_mlp": 0.0, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08688269643752358, + "language_loss": 0.83400619, + "learning_rate": 0.0008198532174148289, + "loss": 0.84507322, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1563, + "time_per_iteration": 2.7336533069610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079297, + "balance_loss_mlp": 1.07195389, + "diversity_loss_mlp": 0.0, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.04112604139988501, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81765467, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1564, + "time_per_iteration": 4.828714609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145426, + "balance_loss_mlp": 1.1324501, + "diversity_loss_mlp": 0.0, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08852118135813189, + "language_loss": 0.89291, + "learning_rate": 0.0008193740542985244, + "loss": 0.90436429, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1565, + "time_per_iteration": 2.5988731384277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151488, + "balance_loss_mlp": 1.13872099, + "diversity_loss_mlp": 0.0, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1281977179548432, + "language_loss": 0.86354733, + "learning_rate": 0.0008191342867058467, + "loss": 0.87506223, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1566, + "time_per_iteration": 2.6914639472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.10574174, + "diversity_loss_mlp": 0.0, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.07018370282969584, + "language_loss": 0.83602738, + "learning_rate": 0.0008188943952142509, + "loss": 0.84721458, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1567, + "time_per_iteration": 2.7846438884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111402, + "balance_loss_mlp": 1.09847367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.08750889372003143, + "language_loss": 0.82150149, + "learning_rate": 0.0008186543799168711, + "loss": 0.83261549, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1568, + "time_per_iteration": 3.1300384998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094152, + "balance_loss_mlp": 1.08103871, + "diversity_loss_mlp": 0.0, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.07719475001811499, + "language_loss": 0.88627326, + "learning_rate": 0.0008184142409068892, + "loss": 0.89721477, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.13134766, + "routerloss_mlp": 0.0, + "step": 1569, + "time_per_iteration": 2.9922726154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_mlp": 1.07475495, + "diversity_loss_mlp": 0.0, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.07345065764158631, + "language_loss": 0.86446834, + "learning_rate": 0.000818173978277536, + "loss": 0.87534571, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.12994385, + "routerloss_mlp": 0.0, + "step": 1570, + "time_per_iteration": 2.695930242538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089564, + "balance_loss_mlp": 1.07673669, + "diversity_loss_mlp": 0.0, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.0712021049255776, + "language_loss": 0.83337176, + "learning_rate": 0.000817933592122089, + "loss": 0.84426749, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.12841797, + "routerloss_mlp": 0.0, + "step": 1571, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.07427394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.08283074842036095, + "language_loss": 0.83667982, + "learning_rate": 0.0008176930825338749, + "loss": 0.84755468, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1572, + "time_per_iteration": 2.5447826385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_mlp": 1.07405734, + "diversity_loss_mlp": 0.0, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07741282152017008, + "language_loss": 0.88849854, + "learning_rate": 0.0008174524496062679, + "loss": 0.89937723, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1573, + "time_per_iteration": 2.908740997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_mlp": 1.07822633, + "diversity_loss_mlp": 0.0, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.06962859876416791, + "language_loss": 0.85499102, + "learning_rate": 0.0008172116934326894, + "loss": 0.86591208, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1574, + "time_per_iteration": 2.751488208770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_mlp": 1.08365786, + "diversity_loss_mlp": 0.0, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.09195920466248479, + "language_loss": 0.8794626, + "learning_rate": 0.0008169708141066097, + "loss": 0.89044309, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1575, + "time_per_iteration": 2.5947275161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118908, + "balance_loss_mlp": 1.10441208, + "diversity_loss_mlp": 0.0, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.0784824693742563, + "language_loss": 0.90658617, + "learning_rate": 0.0008167298117215465, + "loss": 0.91777527, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1576, + "time_per_iteration": 2.5396125316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011316, + "balance_loss_mlp": 1.11705649, + "diversity_loss_mlp": 0.0, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.1093253517132677, + "language_loss": 0.87566864, + "learning_rate": 0.0008164886863710649, + "loss": 0.88698471, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1577, + "time_per_iteration": 2.931835412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138004, + "balance_loss_mlp": 1.12323439, + "diversity_loss_mlp": 0.0, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07788016425512684, + "language_loss": 0.8637675, + "learning_rate": 0.0008162474381487783, + "loss": 0.87514758, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1578, + "time_per_iteration": 3.041262626647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125978, + "balance_loss_mlp": 1.11132693, + "diversity_loss_mlp": 0.0, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.1532642042193693, + "language_loss": 0.84568751, + "learning_rate": 0.0008160060671483475, + "loss": 0.8569473, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1579, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_mlp": 1.0942831, + "diversity_loss_mlp": 0.0, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.10001869607158981, + "language_loss": 0.8342396, + "learning_rate": 0.0008157645734634809, + "loss": 0.84532249, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1580, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151521, + "balance_loss_mlp": 1.14064956, + "diversity_loss_mlp": 0.0, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.06737085519591758, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78048015, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 1581, + "time_per_iteration": 4.946556329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00631723, + "balance_loss_mlp": 1.05820811, + "diversity_loss_mlp": 0.17941347, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.002006006723137456, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.73846221, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01291206, + "step": 1582, + "time_per_iteration": 4.897693395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.08376384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.07529557219412701, + "language_loss": 0.83949858, + "learning_rate": 0.000815039357240067, + "loss": 0.85047406, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1583, + "time_per_iteration": 2.6096932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101837, + "balance_loss_mlp": 1.0882473, + "diversity_loss_mlp": 0.0, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.0740498467066553, + "language_loss": 0.84922493, + "learning_rate": 0.0008147973737554952, + "loss": 0.86024332, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1584, + "time_per_iteration": 2.7863824367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_mlp": 1.09364963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.11669723774220289, + "language_loss": 0.85926318, + "learning_rate": 0.000814555268055744, + "loss": 0.87033093, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1585, + "time_per_iteration": 2.6167564392089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_mlp": 1.1022768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07476018488685929, + "language_loss": 0.87489879, + "learning_rate": 0.0008143130402348073, + "loss": 0.88605773, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1586, + "time_per_iteration": 2.6318202018737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112097, + "balance_loss_mlp": 1.10742807, + "diversity_loss_mlp": 0.0, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.07016471467090964, + "language_loss": 0.79198885, + "learning_rate": 0.0008140706903867265, + "loss": 0.80319858, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1587, + "time_per_iteration": 2.82663893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128991, + "balance_loss_mlp": 1.11541307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09040046070353, + "language_loss": 0.90612531, + "learning_rate": 0.0008138282186055897, + "loss": 0.91741514, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1588, + "time_per_iteration": 2.690561294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_mlp": 1.12872136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.07675542780120453, + "language_loss": 0.82382154, + "learning_rate": 0.0008135856249855331, + "loss": 0.83524311, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1589, + "time_per_iteration": 2.6935813426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115907, + "balance_loss_mlp": 1.14551568, + "diversity_loss_mlp": 0.0, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.07642745969896261, + "language_loss": 0.89603746, + "learning_rate": 0.0008133429096207398, + "loss": 0.90762818, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1590, + "time_per_iteration": 2.7690787315368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_mlp": 1.10534787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.03962763613217991, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76425815, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.08203125, + "routerloss_mlp": 0.0, + "step": 1591, + "time_per_iteration": 4.950432538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_mlp": 1.17060041, + "diversity_loss_mlp": 0.0, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.0624915030883944, + "language_loss": 0.8671608, + "learning_rate": 0.0008128571140339123, + "loss": 0.87900144, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1592, + "time_per_iteration": 2.717022657394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.15618944, + "diversity_loss_mlp": 0.0, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.08640912687422367, + "language_loss": 0.87240267, + "learning_rate": 0.0008126140340004805, + "loss": 0.88410139, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1593, + "time_per_iteration": 2.5112054347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.14379096, + "diversity_loss_mlp": 0.0, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06492228459438584, + "language_loss": 0.82168889, + "learning_rate": 0.0008123708325995172, + "loss": 0.83326268, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1594, + "time_per_iteration": 3.193125009536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139509, + "balance_loss_mlp": 1.1256932, + "diversity_loss_mlp": 0.0, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06515151231920442, + "language_loss": 0.79815221, + "learning_rate": 0.0008121275099254414, + "loss": 0.80954736, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1595, + "time_per_iteration": 2.9032304286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133663, + "balance_loss_mlp": 1.12007284, + "diversity_loss_mlp": 0.0, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06899315915000012, + "language_loss": 0.88638222, + "learning_rate": 0.0008118840660727194, + "loss": 0.89771879, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1596, + "time_per_iteration": 2.6298515796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115466, + "balance_loss_mlp": 1.10215056, + "diversity_loss_mlp": 0.0, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06984166924665287, + "language_loss": 0.87847084, + "learning_rate": 0.0008116405011358644, + "loss": 0.88962543, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.13336182, + "routerloss_mlp": 0.0, + "step": 1597, + "time_per_iteration": 3.1922342777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.08212388, + "diversity_loss_mlp": 0.0, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.07145022695402857, + "language_loss": 0.79985273, + "learning_rate": 0.0008113968152094369, + "loss": 0.81081259, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1598, + "time_per_iteration": 2.500500440597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_mlp": 1.07637632, + "diversity_loss_mlp": 0.0, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.07896733537507578, + "language_loss": 0.82477671, + "learning_rate": 0.0008111530083880438, + "loss": 0.83567768, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1599, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.07693791, + "diversity_loss_mlp": 0.0, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.10700735308097704, + "language_loss": 0.86289096, + "learning_rate": 0.0008109090807663399, + "loss": 0.87379909, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1600, + "time_per_iteration": 2.7883458137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_mlp": 1.07049167, + "diversity_loss_mlp": 0.0, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.058046583591585654, + "language_loss": 0.8845669, + "learning_rate": 0.0008106650324390257, + "loss": 0.89541531, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1601, + "time_per_iteration": 2.8250818252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012306, + "balance_loss_mlp": 1.78856134, + "diversity_loss_mlp": 0.20302816, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.03151963489439222, + "language_loss": 0.81347358, + "learning_rate": 0.0008104208635008493, + "loss": 0.8235966, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165114, + "step": 1602, + "time_per_iteration": 2.6824991703033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078191, + "balance_loss_mlp": 1.06365991, + "diversity_loss_mlp": 0.0, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.06925842581040223, + "language_loss": 0.81696957, + "learning_rate": 0.0008101765740466058, + "loss": 0.82775152, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1603, + "time_per_iteration": 2.4828884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083153, + "balance_loss_mlp": 1.06891942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.08194523431430376, + "language_loss": 0.83996522, + "learning_rate": 0.0008099321641711364, + "loss": 0.85079676, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1604, + "time_per_iteration": 2.628990650177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093302, + "balance_loss_mlp": 1.07891393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.066381842407901, + "language_loss": 0.83568424, + "learning_rate": 0.0008096876339693295, + "loss": 0.84661728, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1605, + "time_per_iteration": 2.621486186981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_mlp": 1.0898906, + "diversity_loss_mlp": 0.0, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08065648415588843, + "language_loss": 0.8146233, + "learning_rate": 0.0008094429835361206, + "loss": 0.82566357, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1606, + "time_per_iteration": 2.9436137676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_mlp": 1.08727765, + "diversity_loss_mlp": 0.0, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.06722603246449312, + "language_loss": 0.85730284, + "learning_rate": 0.0008091982129664908, + "loss": 0.86832106, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1607, + "time_per_iteration": 2.6776270866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_mlp": 1.09606481, + "diversity_loss_mlp": 0.0, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07435522574008574, + "language_loss": 0.83177197, + "learning_rate": 0.0008089533223554687, + "loss": 0.842875, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1608, + "time_per_iteration": 2.6971724033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106883, + "balance_loss_mlp": 1.09322155, + "diversity_loss_mlp": 0.0, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08534881839400792, + "language_loss": 0.85436511, + "learning_rate": 0.0008087083117981294, + "loss": 0.86543399, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1609, + "time_per_iteration": 2.873072624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_mlp": 1.08715367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.08408730625442483, + "language_loss": 0.88209295, + "learning_rate": 0.0008084631813895943, + "loss": 0.89310181, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1610, + "time_per_iteration": 2.7717368602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_mlp": 1.0843389, + "diversity_loss_mlp": 0.0, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07291880748627809, + "language_loss": 0.84093356, + "learning_rate": 0.0008082179312250315, + "loss": 0.85191453, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1611, + "time_per_iteration": 2.6323728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167376, + "balance_loss_mlp": 1.15912676, + "diversity_loss_mlp": 0.0, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.06715325583723679, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81023216, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 1612, + "time_per_iteration": 4.837978839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_mlp": 1.09591889, + "diversity_loss_mlp": 0.0, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.04843806861709949, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77733123, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.07861328, + "routerloss_mlp": 0.0, + "step": 1613, + "time_per_iteration": 5.086154937744141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_mlp": 1.10497594, + "diversity_loss_mlp": 0.0, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.09649046421891638, + "language_loss": 0.82414234, + "learning_rate": 0.0008074814631475545, + "loss": 0.83532858, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1614, + "time_per_iteration": 3.3300058841705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115901, + "balance_loss_mlp": 1.10232294, + "diversity_loss_mlp": 0.0, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.10381126956618623, + "language_loss": 0.7917223, + "learning_rate": 0.0008072357349114907, + "loss": 0.80288124, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1615, + "time_per_iteration": 2.692242383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.1100384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.09811598085954727, + "language_loss": 0.88751173, + "learning_rate": 0.0008069898873959363, + "loss": 0.89874619, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1616, + "time_per_iteration": 2.688138723373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119599, + "balance_loss_mlp": 1.10590243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.06496922585492992, + "language_loss": 0.85670269, + "learning_rate": 0.0008067439206963375, + "loss": 0.8678987, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1617, + "time_per_iteration": 2.628465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126727, + "balance_loss_mlp": 1.11359048, + "diversity_loss_mlp": 0.0, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08367367493581554, + "language_loss": 0.86233091, + "learning_rate": 0.0008064978349081873, + "loss": 0.87359822, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1618, + "time_per_iteration": 2.9359195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_mlp": 1.10941529, + "diversity_loss_mlp": 0.0, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.062058920213391884, + "language_loss": 0.86742592, + "learning_rate": 0.0008062516301270245, + "loss": 0.87865382, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1619, + "time_per_iteration": 2.685615301132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968061, + "balance_loss_mlp": 1.70987701, + "diversity_loss_mlp": 0.19448289, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.02692656797073588, + "language_loss": 0.8831743, + "learning_rate": 0.0008060053064484343, + "loss": 0.89285493, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588114, + "step": 1620, + "time_per_iteration": 2.9507076740264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131577, + "balance_loss_mlp": 1.11839283, + "diversity_loss_mlp": 0.0, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.08216719715750098, + "language_loss": 0.85142976, + "learning_rate": 0.0008057588639680482, + "loss": 0.86274558, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.13208008, + "routerloss_mlp": 0.0, + "step": 1621, + "time_per_iteration": 2.7498936653137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00955916, + "balance_loss_mlp": 1.68915153, + "diversity_loss_mlp": 0.19115068, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.038673577194741904, + "language_loss": 0.82934028, + "learning_rate": 0.0008055123027815434, + "loss": 0.83889943, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01576493, + "step": 1622, + "time_per_iteration": 2.92877459526062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10545552, + "diversity_loss_mlp": 0.0, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.11144773799130939, + "language_loss": 0.8492527, + "learning_rate": 0.0008052656229846436, + "loss": 0.86044282, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.13580322, + "routerloss_mlp": 0.0, + "step": 1623, + "time_per_iteration": 2.6647849082946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_mlp": 1.09039474, + "diversity_loss_mlp": 0.0, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.09067734621983937, + "language_loss": 0.90320027, + "learning_rate": 0.0008050188246731182, + "loss": 0.9142437, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1624, + "time_per_iteration": 2.6908931732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_mlp": 1.07360816, + "diversity_loss_mlp": 0.0, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08706559573327896, + "language_loss": 0.8222695, + "learning_rate": 0.0008047719079427834, + "loss": 0.83314216, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1625, + "time_per_iteration": 2.979578733444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281481, + "balance_loss_mlp": 1.27170551, + "diversity_loss_mlp": 0.0, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.09241126848133228, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75633186, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.09765625, + "routerloss_mlp": 0.0, + "step": 1626, + "time_per_iteration": 4.813723802566528 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_mlp": 1.06489933, + "diversity_loss_mlp": 0.0, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.061158387019755324, + "language_loss": 0.86164916, + "learning_rate": 0.0008042777196091757, + "loss": 0.87243509, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1627, + "time_per_iteration": 2.6777052879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931263, + "balance_loss_mlp": 1.63595629, + "diversity_loss_mlp": 0.19502082, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.02888255305303151, + "language_loss": 0.81839561, + "learning_rate": 0.0008040304481977643, + "loss": 0.82770824, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01577434, + "step": 1628, + "time_per_iteration": 2.685519218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.07024312, + "diversity_loss_mlp": 0.0, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.070875243316129, + "language_loss": 0.86462033, + "learning_rate": 0.0008037830587512649, + "loss": 0.875458, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1629, + "time_per_iteration": 3.0812296867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_mlp": 1.07976675, + "diversity_loss_mlp": 0.0, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.07857424850498267, + "language_loss": 0.78910959, + "learning_rate": 0.0008035355513657224, + "loss": 0.80004621, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1630, + "time_per_iteration": 2.509866714477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109932, + "balance_loss_mlp": 1.08518136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.05926482463995905, + "language_loss": 0.9323386, + "learning_rate": 0.0008032879261372279, + "loss": 0.94333184, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1631, + "time_per_iteration": 2.793675422668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121244, + "balance_loss_mlp": 1.20142555, + "diversity_loss_mlp": 0.0, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.0543299042148954, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80848283, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 1632, + "time_per_iteration": 5.6717705726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08712876, + "diversity_loss_mlp": 0.0, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.07399367926820971, + "language_loss": 0.87236691, + "learning_rate": 0.0008027923225359748, + "loss": 0.88337696, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.13885498, + "routerloss_mlp": 0.0, + "step": 1633, + "time_per_iteration": 2.591161012649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09272563, + "diversity_loss_mlp": 0.0, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07361205381971474, + "language_loss": 0.8823992, + "learning_rate": 0.0008025443443556267, + "loss": 0.89347273, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1634, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.09279966, + "diversity_loss_mlp": 0.0, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.05821338652647348, + "language_loss": 0.88174599, + "learning_rate": 0.000802296248717147, + "loss": 0.89281231, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1635, + "time_per_iteration": 2.924661159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_mlp": 1.08889091, + "diversity_loss_mlp": 0.0, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06918051977022115, + "language_loss": 0.78766519, + "learning_rate": 0.0008020480357168554, + "loss": 0.79869324, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1636, + "time_per_iteration": 2.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_mlp": 1.08334041, + "diversity_loss_mlp": 0.0, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.061070409346790804, + "language_loss": 0.88343245, + "learning_rate": 0.0008017997054511165, + "loss": 0.89440191, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.13623047, + "routerloss_mlp": 0.0, + "step": 1637, + "time_per_iteration": 2.5770463943481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109567, + "balance_loss_mlp": 1.08241367, + "diversity_loss_mlp": 0.0, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06082888573267997, + "language_loss": 0.85688329, + "learning_rate": 0.0008015512580163407, + "loss": 0.86783999, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1638, + "time_per_iteration": 2.7893900871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915347, + "balance_loss_mlp": 1.6005652, + "diversity_loss_mlp": 0.19760543, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.03200753828687725, + "language_loss": 0.80247211, + "learning_rate": 0.0008013026935089838, + "loss": 0.8116256, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0162621, + "step": 1639, + "time_per_iteration": 2.9013028144836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116887, + "balance_loss_mlp": 1.10366678, + "diversity_loss_mlp": 0.0, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.07107229367788748, + "language_loss": 0.84156835, + "learning_rate": 0.0008010540120255472, + "loss": 0.85273731, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1640, + "time_per_iteration": 2.6617894172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122905, + "balance_loss_mlp": 1.10991144, + "diversity_loss_mlp": 0.0, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.08316081918757003, + "language_loss": 0.86058956, + "learning_rate": 0.0008008052136625774, + "loss": 0.87181866, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1641, + "time_per_iteration": 2.8128581047058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.10461712, + "diversity_loss_mlp": 0.0, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.11340060957388516, + "language_loss": 0.86898887, + "learning_rate": 0.0008005562985166666, + "loss": 0.88016647, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1642, + "time_per_iteration": 2.6915791034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113412, + "balance_loss_mlp": 1.10045385, + "diversity_loss_mlp": 0.0, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.06371803301806024, + "language_loss": 0.85065734, + "learning_rate": 0.0008003072666844524, + "loss": 0.86179143, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.12976074, + "routerloss_mlp": 0.0, + "step": 1643, + "time_per_iteration": 2.713515520095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_mlp": 1.09287417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.09207812275617455, + "language_loss": 0.82446098, + "learning_rate": 0.0008000581182626173, + "loss": 0.83551639, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 1644, + "time_per_iteration": 2.5728507041931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099382, + "balance_loss_mlp": 1.08668065, + "diversity_loss_mlp": 0.0, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.07446065392993936, + "language_loss": 0.86341298, + "learning_rate": 0.0007998088533478894, + "loss": 0.87440687, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 1645, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_mlp": 1.09096265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.09512310951915111, + "language_loss": 0.84171218, + "learning_rate": 0.000799559472037042, + "loss": 0.85274899, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 1646, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089286, + "balance_loss_mlp": 1.07678151, + "diversity_loss_mlp": 0.0, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05690135295492242, + "language_loss": 0.87462902, + "learning_rate": 0.0007993099744268932, + "loss": 0.88552189, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1647, + "time_per_iteration": 2.9204719066619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_mlp": 1.08491409, + "diversity_loss_mlp": 0.0, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08028992569563033, + "language_loss": 0.88103539, + "learning_rate": 0.000799060360614307, + "loss": 0.8920151, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1648, + "time_per_iteration": 2.7098584175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094994, + "balance_loss_mlp": 1.08204746, + "diversity_loss_mlp": 0.0, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07374581447427947, + "language_loss": 0.83565277, + "learning_rate": 0.0007988106306961917, + "loss": 0.84660268, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1649, + "time_per_iteration": 3.136148691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096074, + "balance_loss_mlp": 1.08292556, + "diversity_loss_mlp": 0.0, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.08307651310008923, + "language_loss": 0.84510154, + "learning_rate": 0.0007985607847695014, + "loss": 0.85606229, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1650, + "time_per_iteration": 2.6657865047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090136, + "balance_loss_mlp": 1.07697558, + "diversity_loss_mlp": 0.0, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.07221907468491222, + "language_loss": 0.82981718, + "learning_rate": 0.0007983108229312345, + "loss": 0.84071863, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.13183594, + "routerloss_mlp": 0.0, + "step": 1651, + "time_per_iteration": 2.939943313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_mlp": 1.07648206, + "diversity_loss_mlp": 0.0, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.0785368607999539, + "language_loss": 0.86505926, + "learning_rate": 0.0007980607452784351, + "loss": 0.87595987, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1652, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.06952596, + "diversity_loss_mlp": 0.0, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.06920593361186494, + "language_loss": 0.90510356, + "learning_rate": 0.0007978105519081919, + "loss": 0.91593033, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1653, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084984, + "balance_loss_mlp": 1.0715965, + "diversity_loss_mlp": 0.0, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.07269169213621761, + "language_loss": 0.87967515, + "learning_rate": 0.0007975602429176385, + "loss": 0.89052504, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.13415527, + "routerloss_mlp": 0.0, + "step": 1654, + "time_per_iteration": 2.5818393230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_mlp": 1.07225442, + "diversity_loss_mlp": 0.0, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08150423110047789, + "language_loss": 0.81308222, + "learning_rate": 0.0007973098184039536, + "loss": 0.82394195, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1655, + "time_per_iteration": 2.664916515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094509, + "balance_loss_mlp": 1.08110952, + "diversity_loss_mlp": 0.0, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.0661968945841423, + "language_loss": 0.8695243, + "learning_rate": 0.0007970592784643602, + "loss": 0.88046944, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.13427734, + "routerloss_mlp": 0.0, + "step": 1656, + "time_per_iteration": 2.851214647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_mlp": 1.09084868, + "diversity_loss_mlp": 0.0, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.0809768283097012, + "language_loss": 0.85228848, + "learning_rate": 0.0007968086231961272, + "loss": 0.86333275, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1657, + "time_per_iteration": 2.6277201175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_mlp": 1.09744644, + "diversity_loss_mlp": 0.0, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.10999441213252201, + "language_loss": 0.83322126, + "learning_rate": 0.0007965578526965671, + "loss": 0.84433806, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1658, + "time_per_iteration": 2.5514447689056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097556, + "balance_loss_mlp": 1.08337009, + "diversity_loss_mlp": 0.0, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07090711515760839, + "language_loss": 0.86299932, + "learning_rate": 0.0007963069670630377, + "loss": 0.87397492, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1659, + "time_per_iteration": 2.722572088241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.07523549, + "diversity_loss_mlp": 0.0, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07181055202596492, + "language_loss": 0.88127738, + "learning_rate": 0.0007960559663929416, + "loss": 0.8921715, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1660, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_mlp": 1.06500006, + "diversity_loss_mlp": 0.0, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.06614466369263741, + "language_loss": 0.87915826, + "learning_rate": 0.0007958048507837259, + "loss": 0.88995141, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1661, + "time_per_iteration": 2.954888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06107187, + "diversity_loss_mlp": 0.0, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08599761261652404, + "language_loss": 0.87309289, + "learning_rate": 0.0007955536203328822, + "loss": 0.88384914, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1662, + "time_per_iteration": 2.9499282836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074811, + "balance_loss_mlp": 1.06073272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.08962386225204486, + "language_loss": 0.8334958, + "learning_rate": 0.0007953022751379469, + "loss": 0.84424388, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1663, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075722, + "balance_loss_mlp": 1.06131005, + "diversity_loss_mlp": 0.0, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.08182948291647181, + "language_loss": 0.8200748, + "learning_rate": 0.000795050815296501, + "loss": 0.830832, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1664, + "time_per_iteration": 2.9893014430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.07167196, + "diversity_loss_mlp": 0.0, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.0641722272838546, + "language_loss": 0.93037909, + "learning_rate": 0.0007947992409061695, + "loss": 0.94122881, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1665, + "time_per_iteration": 2.583789110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_mlp": 1.08662808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07388769827525307, + "language_loss": 0.86501724, + "learning_rate": 0.0007945475520646226, + "loss": 0.87601787, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1666, + "time_per_iteration": 2.944988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127031, + "balance_loss_mlp": 1.11408508, + "diversity_loss_mlp": 0.0, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.0781321549049884, + "language_loss": 0.84777099, + "learning_rate": 0.0007942957488695743, + "loss": 0.85904133, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1667, + "time_per_iteration": 2.667464017868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.12505507, + "diversity_loss_mlp": 0.0, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06588913292879497, + "language_loss": 0.81000018, + "learning_rate": 0.0007940438314187833, + "loss": 0.82138324, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.13250732, + "routerloss_mlp": 0.0, + "step": 1668, + "time_per_iteration": 3.0395359992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147791, + "balance_loss_mlp": 1.13491094, + "diversity_loss_mlp": 0.0, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.07621602089938284, + "language_loss": 0.80540276, + "learning_rate": 0.0007937917998100529, + "loss": 0.8168807, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1669, + "time_per_iteration": 2.5894687175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_mlp": 1.1294744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07981389159152626, + "language_loss": 0.79167509, + "learning_rate": 0.0007935396541412302, + "loss": 0.80310035, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.13067627, + "routerloss_mlp": 0.0, + "step": 1670, + "time_per_iteration": 2.672978401184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141245, + "balance_loss_mlp": 1.12813175, + "diversity_loss_mlp": 0.0, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.06899314705075654, + "language_loss": 0.85712755, + "learning_rate": 0.0007932873945102068, + "loss": 0.86854005, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1671, + "time_per_iteration": 2.6296515464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_mlp": 1.25616145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.05047573422440889, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.77033865, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1672, + "time_per_iteration": 4.840561628341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138556, + "balance_loss_mlp": 1.1251744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.06902528499394482, + "language_loss": 0.86527705, + "learning_rate": 0.0007927825337533461, + "loss": 0.87666261, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1673, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142697, + "balance_loss_mlp": 1.12930942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.08521571565711833, + "language_loss": 0.84877092, + "learning_rate": 0.0007925299328235131, + "loss": 0.8601979, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1674, + "time_per_iteration": 2.659621238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141943, + "balance_loss_mlp": 1.12855613, + "diversity_loss_mlp": 0.0, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.08187135533898351, + "language_loss": 0.84720862, + "learning_rate": 0.000792277218323488, + "loss": 0.85862803, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1675, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.12169456, + "diversity_loss_mlp": 0.0, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.08499328402904442, + "language_loss": 0.8509531, + "learning_rate": 0.0007920243903513833, + "loss": 0.86230332, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1676, + "time_per_iteration": 2.5730555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126699, + "balance_loss_mlp": 1.11364567, + "diversity_loss_mlp": 0.0, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08854342537284099, + "language_loss": 0.84008271, + "learning_rate": 0.0007917714490053556, + "loss": 0.85134971, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1677, + "time_per_iteration": 2.718555212020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_mlp": 1.10974979, + "diversity_loss_mlp": 0.0, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.07711595043056121, + "language_loss": 0.86223996, + "learning_rate": 0.0007915183943836055, + "loss": 0.87346947, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1678, + "time_per_iteration": 2.902038812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_mlp": 1.09958673, + "diversity_loss_mlp": 0.0, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07762427611918464, + "language_loss": 0.8422336, + "learning_rate": 0.0007912652265843773, + "loss": 0.85335761, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1679, + "time_per_iteration": 3.024665117263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107958, + "balance_loss_mlp": 1.09453535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06959311244041297, + "language_loss": 0.81845474, + "learning_rate": 0.0007910119457059597, + "loss": 0.82953429, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1680, + "time_per_iteration": 2.6954221725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111109, + "balance_loss_mlp": 1.09806776, + "diversity_loss_mlp": 0.0, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08135634404485692, + "language_loss": 0.80380678, + "learning_rate": 0.0007907585518466849, + "loss": 0.81491786, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1681, + "time_per_iteration": 2.961648464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_mlp": 1.09574652, + "diversity_loss_mlp": 0.0, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.06462126830885603, + "language_loss": 0.89670283, + "learning_rate": 0.000790505045104929, + "loss": 0.90779042, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1682, + "time_per_iteration": 2.5210485458374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_mlp": 1.09719789, + "diversity_loss_mlp": 0.0, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.08715930327910015, + "language_loss": 0.86719161, + "learning_rate": 0.0007902514255791125, + "loss": 0.8782934, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1683, + "time_per_iteration": 2.8002610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097901, + "balance_loss_mlp": 1.084764, + "diversity_loss_mlp": 0.0, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06656486310868524, + "language_loss": 0.8795855, + "learning_rate": 0.0007899976933676986, + "loss": 0.89056444, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1684, + "time_per_iteration": 2.967172622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092249, + "balance_loss_mlp": 1.07880259, + "diversity_loss_mlp": 0.0, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09628316614228749, + "language_loss": 0.87045735, + "learning_rate": 0.0007897438485691955, + "loss": 0.88137984, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1685, + "time_per_iteration": 2.680147171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103099, + "balance_loss_mlp": 1.0898304, + "diversity_loss_mlp": 0.0, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0850736326825917, + "language_loss": 0.82684374, + "learning_rate": 0.0007894898912821542, + "loss": 0.83787471, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1686, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.0880518, + "diversity_loss_mlp": 0.0, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06056792299191916, + "language_loss": 0.86695451, + "learning_rate": 0.0007892358216051695, + "loss": 0.87797034, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1687, + "time_per_iteration": 2.7851648330688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.09641767, + "diversity_loss_mlp": 0.0, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.07434076211008771, + "language_loss": 0.91829026, + "learning_rate": 0.0007889816396368803, + "loss": 0.92938912, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1688, + "time_per_iteration": 2.6211581230163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.10499799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07845440141588131, + "language_loss": 0.85253429, + "learning_rate": 0.0007887273454759687, + "loss": 0.8637172, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.13299561, + "routerloss_mlp": 0.0, + "step": 1689, + "time_per_iteration": 2.507779598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.10946417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.08373410695529686, + "language_loss": 0.82792354, + "learning_rate": 0.0007884729392211603, + "loss": 0.83914578, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1690, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119249, + "balance_loss_mlp": 1.10672641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09069843341009556, + "language_loss": 0.85648167, + "learning_rate": 0.0007882184209712245, + "loss": 0.86767411, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.12530518, + "routerloss_mlp": 0.0, + "step": 1691, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00949982, + "balance_loss_mlp": 1.66309059, + "diversity_loss_mlp": 0.20491584, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.028395749586794427, + "language_loss": 0.85757548, + "learning_rate": 0.000787963790824974, + "loss": 0.86707526, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597837, + "step": 1692, + "time_per_iteration": 3.009209156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113225, + "balance_loss_mlp": 1.10071397, + "diversity_loss_mlp": 0.0, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.22846677162281695, + "language_loss": 0.89612615, + "learning_rate": 0.0007877090488812651, + "loss": 0.90725839, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.12512207, + "routerloss_mlp": 0.0, + "step": 1693, + "time_per_iteration": 2.450209617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936753, + "balance_loss_mlp": 1.63723278, + "diversity_loss_mlp": 0.20419246, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.03161007726798549, + "language_loss": 0.83743423, + "learning_rate": 0.0007874541952389973, + "loss": 0.84680176, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01604037, + "step": 1694, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111615, + "balance_loss_mlp": 1.10350823, + "diversity_loss_mlp": 0.0, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.07424213060006848, + "language_loss": 0.86538494, + "learning_rate": 0.0007871992299971136, + "loss": 0.87654638, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1695, + "time_per_iteration": 2.570406913757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_mlp": 1.11953878, + "diversity_loss_mlp": 0.0, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.0612219868328418, + "language_loss": 0.84142137, + "learning_rate": 0.0007869441532546001, + "loss": 0.852741, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1696, + "time_per_iteration": 2.763688087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_mlp": 1.11626601, + "diversity_loss_mlp": 0.0, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06155756648422996, + "language_loss": 0.79298395, + "learning_rate": 0.0007866889651104867, + "loss": 0.80426925, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 1697, + "time_per_iteration": 2.816236972808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_mlp": 1.11769366, + "diversity_loss_mlp": 0.0, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0827611554210385, + "language_loss": 0.83172429, + "learning_rate": 0.000786433665663846, + "loss": 0.84303296, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.13195801, + "routerloss_mlp": 0.0, + "step": 1698, + "time_per_iteration": 2.6627049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135283, + "balance_loss_mlp": 1.12240815, + "diversity_loss_mlp": 0.0, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.08562611300573084, + "language_loss": 0.86256903, + "learning_rate": 0.0007861782550137942, + "loss": 0.87392187, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1699, + "time_per_iteration": 2.9298973083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115677, + "balance_loss_mlp": 1.10270739, + "diversity_loss_mlp": 0.0, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.06870341741306431, + "language_loss": 0.85913056, + "learning_rate": 0.0007859227332594901, + "loss": 0.8702873, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1700, + "time_per_iteration": 2.9108214378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_mlp": 1.08703494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.08010897822069696, + "language_loss": 0.84705722, + "learning_rate": 0.0007856671005001365, + "loss": 0.85805643, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1701, + "time_per_iteration": 3.172921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_mlp": 1.07506084, + "diversity_loss_mlp": 0.0, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.0963591610521261, + "language_loss": 0.81720912, + "learning_rate": 0.0007854113568349787, + "loss": 0.82809043, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.13085938, + "routerloss_mlp": 0.0, + "step": 1702, + "time_per_iteration": 3.1135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_mlp": 1.08686948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07838750037803571, + "language_loss": 0.80661154, + "learning_rate": 0.0007851555023633052, + "loss": 0.8176142, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.13397217, + "routerloss_mlp": 0.0, + "step": 1703, + "time_per_iteration": 2.841059684753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_mlp": 1.07271171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07047077484334266, + "language_loss": 0.82222247, + "learning_rate": 0.0007848995371844474, + "loss": 0.83308667, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1704, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.0816896, + "diversity_loss_mlp": 0.0, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08203255389116743, + "language_loss": 0.80260348, + "learning_rate": 0.0007846434613977801, + "loss": 0.81355333, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1705, + "time_per_iteration": 2.523026466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.0868392, + "diversity_loss_mlp": 0.0, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.07270926258732689, + "language_loss": 0.78603041, + "learning_rate": 0.0007843872751027203, + "loss": 0.7970314, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.13275146, + "routerloss_mlp": 0.0, + "step": 1706, + "time_per_iteration": 2.8923709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915397, + "balance_loss_mlp": 1.59612775, + "diversity_loss_mlp": 0.20258766, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.02966318853366187, + "language_loss": 0.87305748, + "learning_rate": 0.0007841309783987287, + "loss": 0.88221151, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01603885, + "step": 1707, + "time_per_iteration": 2.7517144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115655, + "balance_loss_mlp": 1.10263109, + "diversity_loss_mlp": 0.0, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06500174516261728, + "language_loss": 0.89240694, + "learning_rate": 0.0007838745713853084, + "loss": 0.9035635, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1708, + "time_per_iteration": 2.6181201934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_mlp": 1.10945296, + "diversity_loss_mlp": 0.0, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.06936064314807153, + "language_loss": 0.8434307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85465395, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.12866211, + "routerloss_mlp": 0.0, + "step": 1709, + "time_per_iteration": 2.7040350437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124287, + "balance_loss_mlp": 1.1112572, + "diversity_loss_mlp": 0.0, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06883588356672955, + "language_loss": 0.86454904, + "learning_rate": 0.0007833614268284082, + "loss": 0.87579191, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 1710, + "time_per_iteration": 2.5110740661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425821, + "balance_loss_mlp": 1.41738081, + "diversity_loss_mlp": 0.0, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.1402114647579648, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75535595, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.08447266, + "routerloss_mlp": 0.0, + "step": 1711, + "time_per_iteration": 4.873327016830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129416, + "balance_loss_mlp": 1.11650598, + "diversity_loss_mlp": 0.0, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0798208466882041, + "language_loss": 0.78414649, + "learning_rate": 0.0007828478422289016, + "loss": 0.79544067, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 1712, + "time_per_iteration": 2.608412027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138403, + "balance_loss_mlp": 1.12507582, + "diversity_loss_mlp": 0.0, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07544776571140048, + "language_loss": 0.8909815, + "learning_rate": 0.0007825908851623833, + "loss": 0.90236557, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.13323975, + "routerloss_mlp": 0.0, + "step": 1713, + "time_per_iteration": 2.8033607006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_mlp": 1.12190771, + "diversity_loss_mlp": 0.0, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.06974595077498419, + "language_loss": 0.85003847, + "learning_rate": 0.0007823338183843533, + "loss": 0.86138809, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1714, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148942, + "balance_loss_mlp": 1.13610959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.07049806127627434, + "language_loss": 0.81025606, + "learning_rate": 0.0007820766419946141, + "loss": 0.82174551, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1715, + "time_per_iteration": 3.3007164001464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168148, + "balance_loss_mlp": 1.16008925, + "diversity_loss_mlp": 0.0, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.052131774928428895, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80840629, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1716, + "time_per_iteration": 4.947760105133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906852, + "balance_loss_mlp": 1.58163857, + "diversity_loss_mlp": 0.20079982, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.033697214377685164, + "language_loss": 0.75853068, + "learning_rate": 0.0007815619607794288, + "loss": 0.76759923, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563331, + "step": 1717, + "time_per_iteration": 2.689937114715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173062, + "balance_loss_mlp": 1.1601274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.09689448967864323, + "language_loss": 0.8294118, + "learning_rate": 0.0007813044561538001, + "loss": 0.84114236, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1718, + "time_per_iteration": 3.1421005725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158875, + "balance_loss_mlp": 1.14559531, + "diversity_loss_mlp": 0.0, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06842928932014077, + "language_loss": 0.88578129, + "learning_rate": 0.0007810468423160958, + "loss": 0.89736998, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1719, + "time_per_iteration": 2.8917293548583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157511, + "balance_loss_mlp": 1.14486265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06941390463820386, + "language_loss": 0.81896281, + "learning_rate": 0.0007807891193663306, + "loss": 0.83053792, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 1720, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141979, + "balance_loss_mlp": 1.12950385, + "diversity_loss_mlp": 0.0, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07961809028947962, + "language_loss": 0.82409328, + "learning_rate": 0.0007805312874045614, + "loss": 0.83551311, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1721, + "time_per_iteration": 2.5056259632110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137994, + "balance_loss_mlp": 1.12510777, + "diversity_loss_mlp": 0.0, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.09061115976682882, + "language_loss": 0.86960506, + "learning_rate": 0.0007802733465308874, + "loss": 0.88098502, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1722, + "time_per_iteration": 2.438533306121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_mlp": 1.13225603, + "diversity_loss_mlp": 0.0, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06773749819611302, + "language_loss": 0.84162688, + "learning_rate": 0.0007800152968454501, + "loss": 0.8530758, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1723, + "time_per_iteration": 2.6364991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.12146711, + "diversity_loss_mlp": 0.0, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06044198445597461, + "language_loss": 0.90330362, + "learning_rate": 0.0007797571384484334, + "loss": 0.91464406, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.12567139, + "routerloss_mlp": 0.0, + "step": 1724, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133346, + "balance_loss_mlp": 1.12061453, + "diversity_loss_mlp": 0.0, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.0752969909322094, + "language_loss": 0.91929704, + "learning_rate": 0.0007794988714400633, + "loss": 0.93063056, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 1725, + "time_per_iteration": 2.615788698196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125798, + "balance_loss_mlp": 1.11242867, + "diversity_loss_mlp": 0.0, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.07890733478173245, + "language_loss": 0.85302055, + "learning_rate": 0.0007792404959206079, + "loss": 0.86427855, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.13372803, + "routerloss_mlp": 0.0, + "step": 1726, + "time_per_iteration": 2.545780897140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107165, + "balance_loss_mlp": 1.09446895, + "diversity_loss_mlp": 0.0, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07756389475354548, + "language_loss": 0.81480336, + "learning_rate": 0.0007789820119903774, + "loss": 0.82587504, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.12689209, + "routerloss_mlp": 0.0, + "step": 1727, + "time_per_iteration": 3.005662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114992, + "balance_loss_mlp": 1.10335684, + "diversity_loss_mlp": 0.0, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.03748312413261812, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.7960766, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 1728, + "time_per_iteration": 4.833205223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105872, + "balance_loss_mlp": 1.09285486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.07170574552345628, + "language_loss": 0.83970881, + "learning_rate": 0.0007784647192990428, + "loss": 0.85076749, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 1729, + "time_per_iteration": 2.7309772968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107979, + "balance_loss_mlp": 1.0948776, + "diversity_loss_mlp": 0.0, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06011930461286596, + "language_loss": 0.80777055, + "learning_rate": 0.0007782059107387696, + "loss": 0.81885028, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.13116455, + "routerloss_mlp": 0.0, + "step": 1730, + "time_per_iteration": 2.8615641593933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113195, + "balance_loss_mlp": 1.11733532, + "diversity_loss_mlp": 0.0, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08106060743083753, + "language_loss": 0.88617826, + "learning_rate": 0.0007779469941693826, + "loss": 0.89749771, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1731, + "time_per_iteration": 2.801208257675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126142, + "balance_loss_mlp": 1.11240935, + "diversity_loss_mlp": 0.0, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.09519717038034853, + "language_loss": 0.77091044, + "learning_rate": 0.0007776879696914029, + "loss": 0.78217185, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1732, + "time_per_iteration": 2.8286595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123068, + "balance_loss_mlp": 1.10889435, + "diversity_loss_mlp": 0.0, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.05947539267688924, + "language_loss": 0.88910627, + "learning_rate": 0.000777428837405392, + "loss": 0.90033698, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1733, + "time_per_iteration": 2.8319156169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121491, + "balance_loss_mlp": 1.10701954, + "diversity_loss_mlp": 0.0, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.07113995025739508, + "language_loss": 0.86735553, + "learning_rate": 0.0007771695974119544, + "loss": 0.87857044, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1734, + "time_per_iteration": 2.5376570224761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_mlp": 1.09795249, + "diversity_loss_mlp": 0.0, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.08734149249458338, + "language_loss": 0.75937277, + "learning_rate": 0.0007769102498117359, + "loss": 0.77049315, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1735, + "time_per_iteration": 3.093188524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105698, + "balance_loss_mlp": 1.09138131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06929562674350419, + "language_loss": 0.79383999, + "learning_rate": 0.000776650794705424, + "loss": 0.80489695, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1736, + "time_per_iteration": 3.253673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_mlp": 1.10730791, + "diversity_loss_mlp": 0.0, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.06325878214231093, + "language_loss": 0.82130396, + "learning_rate": 0.0007763912321937483, + "loss": 0.83252084, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1737, + "time_per_iteration": 2.7109947204589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117751, + "balance_loss_mlp": 1.10324299, + "diversity_loss_mlp": 0.0, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.08404595709863052, + "language_loss": 0.82403475, + "learning_rate": 0.0007761315623774799, + "loss": 0.83521223, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1738, + "time_per_iteration": 3.4125657081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109047, + "balance_loss_mlp": 1.0946703, + "diversity_loss_mlp": 0.0, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08421865543081901, + "language_loss": 0.87820536, + "learning_rate": 0.0007758717853574313, + "loss": 0.88929582, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1739, + "time_per_iteration": 2.7345223426818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106099, + "balance_loss_mlp": 1.09184134, + "diversity_loss_mlp": 0.0, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.07638673743764693, + "language_loss": 0.90095574, + "learning_rate": 0.0007756119012344571, + "loss": 0.91201669, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1740, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.08709717, + "diversity_loss_mlp": 0.0, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06863708242027233, + "language_loss": 0.8461023, + "learning_rate": 0.0007753519101094535, + "loss": 0.85711253, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1741, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089984, + "balance_loss_mlp": 1.07595301, + "diversity_loss_mlp": 0.0, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.07992644583812669, + "language_loss": 0.86363387, + "learning_rate": 0.0007750918120833575, + "loss": 0.87453371, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1742, + "time_per_iteration": 2.58940052986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.07488728, + "diversity_loss_mlp": 0.0, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.11201991585260462, + "language_loss": 0.87392128, + "learning_rate": 0.0007748316072571485, + "loss": 0.88480592, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1743, + "time_per_iteration": 2.8557286262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_mlp": 1.07202053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0749416267225997, + "language_loss": 0.79045737, + "learning_rate": 0.0007745712957318467, + "loss": 0.80131996, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1744, + "time_per_iteration": 2.9912548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_mlp": 1.07057166, + "diversity_loss_mlp": 0.0, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06946859722884112, + "language_loss": 0.86471289, + "learning_rate": 0.0007743108776085141, + "loss": 0.87555522, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1745, + "time_per_iteration": 2.7899224758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_mlp": 1.07023191, + "diversity_loss_mlp": 0.0, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08256839233284315, + "language_loss": 0.82965624, + "learning_rate": 0.0007740503529882543, + "loss": 0.84050083, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1746, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_mlp": 1.07044971, + "diversity_loss_mlp": 0.0, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.07349682427851349, + "language_loss": 0.90707254, + "learning_rate": 0.0007737897219722114, + "loss": 0.91791821, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1747, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_mlp": 1.07794499, + "diversity_loss_mlp": 0.0, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.05794758251669461, + "language_loss": 0.81094921, + "learning_rate": 0.0007735289846615716, + "loss": 0.82187206, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1748, + "time_per_iteration": 2.677976369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108166, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.0827866783592608, + "language_loss": 0.823035, + "learning_rate": 0.0007732681411575621, + "loss": 0.8341167, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1749, + "time_per_iteration": 2.674349069595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114062, + "balance_loss_mlp": 1.09997165, + "diversity_loss_mlp": 0.0, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.4203922337067485, + "language_loss": 0.87328398, + "learning_rate": 0.0007730071915614514, + "loss": 0.88442457, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1750, + "time_per_iteration": 2.6714634895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113648, + "balance_loss_mlp": 1.10037947, + "diversity_loss_mlp": 0.0, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09571011442330926, + "language_loss": 0.88792437, + "learning_rate": 0.0007727461359745489, + "loss": 0.89906085, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1751, + "time_per_iteration": 2.469905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141755, + "balance_loss_mlp": 1.12897623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.07412184794878955, + "language_loss": 0.85941112, + "learning_rate": 0.0007724849744982056, + "loss": 0.87082875, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 1752, + "time_per_iteration": 2.6805977821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117715, + "balance_loss_mlp": 1.16388226, + "diversity_loss_mlp": 0.0, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.09378397224837084, + "language_loss": 0.81843758, + "learning_rate": 0.0007722237072338131, + "loss": 0.83020908, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1753, + "time_per_iteration": 2.7348344326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186311, + "balance_loss_mlp": 1.17280459, + "diversity_loss_mlp": 0.0, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.1034159122014491, + "language_loss": 0.85304463, + "learning_rate": 0.0007719623342828046, + "loss": 0.86490774, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1754, + "time_per_iteration": 2.5181336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202577, + "balance_loss_mlp": 1.18872511, + "diversity_loss_mlp": 0.0, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.12703041648808322, + "language_loss": 0.84088987, + "learning_rate": 0.000771700855746654, + "loss": 0.85291564, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1755, + "time_per_iteration": 2.590925931930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188345, + "balance_loss_mlp": 1.1743381, + "diversity_loss_mlp": 0.0, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06849832931784437, + "language_loss": 0.88371092, + "learning_rate": 0.0007714392717268763, + "loss": 0.89559436, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1756, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_mlp": 1.17545295, + "diversity_loss_mlp": 0.0, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.09135673410225151, + "language_loss": 0.8630141, + "learning_rate": 0.0007711775823250273, + "loss": 0.8749072, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1757, + "time_per_iteration": 2.562939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194838, + "balance_loss_mlp": 1.18069935, + "diversity_loss_mlp": 0.0, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.07414503329772545, + "language_loss": 0.83081156, + "learning_rate": 0.0007709157876427039, + "loss": 0.84275991, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1758, + "time_per_iteration": 3.0652947425842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190916, + "balance_loss_mlp": 1.17681408, + "diversity_loss_mlp": 0.0, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.06977999371164574, + "language_loss": 0.85321373, + "learning_rate": 0.0007706538877815439, + "loss": 0.86512285, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1759, + "time_per_iteration": 2.5949320793151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202515, + "balance_loss_mlp": 1.1888063, + "diversity_loss_mlp": 0.0, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.052908737395413206, + "language_loss": 0.83029473, + "learning_rate": 0.0007703918828432259, + "loss": 0.84231991, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1760, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231589, + "balance_loss_mlp": 1.21696198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.11529749255982873, + "language_loss": 0.89274669, + "learning_rate": 0.000770129772929469, + "loss": 0.90506256, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1761, + "time_per_iteration": 2.6486427783966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212596, + "balance_loss_mlp": 1.19812357, + "diversity_loss_mlp": 0.0, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.10010821715075297, + "language_loss": 0.8820551, + "learning_rate": 0.0007698675581420334, + "loss": 0.89418107, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1762, + "time_per_iteration": 2.8473589420318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_mlp": 1.15610099, + "diversity_loss_mlp": 0.0, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06768336788468338, + "language_loss": 0.79040444, + "learning_rate": 0.0007696052385827199, + "loss": 0.80210984, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1763, + "time_per_iteration": 2.9893951416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147034, + "balance_loss_mlp": 1.13271689, + "diversity_loss_mlp": 0.0, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.06731413775333611, + "language_loss": 0.78161937, + "learning_rate": 0.00076934281435337, + "loss": 0.79308975, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1764, + "time_per_iteration": 2.7329161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933074, + "balance_loss_mlp": 1.62411106, + "diversity_loss_mlp": 0.20785357, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0341650984642099, + "language_loss": 0.86205357, + "learning_rate": 0.0007690802855558658, + "loss": 0.87138426, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170921, + "step": 1765, + "time_per_iteration": 2.9281163215637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121638, + "balance_loss_mlp": 1.10924029, + "diversity_loss_mlp": 0.0, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.029090002598214117, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77496594, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 1766, + "time_per_iteration": 4.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104609, + "balance_loss_mlp": 1.08886182, + "diversity_loss_mlp": 0.0, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.08396151855964885, + "language_loss": 0.89357018, + "learning_rate": 0.0007685549146641262, + "loss": 0.90461624, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1767, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_mlp": 1.093521, + "diversity_loss_mlp": 0.0, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.10736891621188589, + "language_loss": 0.8816734, + "learning_rate": 0.0007682920727738579, + "loss": 0.89275646, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1768, + "time_per_iteration": 2.5119268894195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_mlp": 1.08738232, + "diversity_loss_mlp": 0.0, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.10494960168224592, + "language_loss": 0.85048056, + "learning_rate": 0.000768029126723369, + "loss": 0.86150718, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1769, + "time_per_iteration": 2.495424270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090257, + "balance_loss_mlp": 1.07520068, + "diversity_loss_mlp": 0.0, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08686425564719477, + "language_loss": 0.82128584, + "learning_rate": 0.0007677660766147447, + "loss": 0.83218843, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 1770, + "time_per_iteration": 2.532904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_mlp": 1.05578792, + "diversity_loss_mlp": 0.0, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.023964921008177247, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73537892, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 1771, + "time_per_iteration": 4.944117784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.1034112, + "diversity_loss_mlp": 0.0, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.10616133846526872, + "language_loss": 0.795196, + "learning_rate": 0.0007672396646316306, + "loss": 0.80637527, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1772, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134399, + "balance_loss_mlp": 1.11959314, + "diversity_loss_mlp": 0.0, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.07513330183645242, + "language_loss": 0.80376065, + "learning_rate": 0.000766976302961512, + "loss": 0.8151046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1773, + "time_per_iteration": 3.042421340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158934, + "balance_loss_mlp": 1.14410484, + "diversity_loss_mlp": 0.0, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.07872996810077096, + "language_loss": 0.81390858, + "learning_rate": 0.0007667128376420003, + "loss": 0.82549793, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1774, + "time_per_iteration": 2.536562442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208475, + "balance_loss_mlp": 1.19358635, + "diversity_loss_mlp": 0.0, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.08297883362487203, + "language_loss": 0.8462863, + "learning_rate": 0.0007664492687753817, + "loss": 0.85837102, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1775, + "time_per_iteration": 2.6977102756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198612, + "balance_loss_mlp": 1.18424678, + "diversity_loss_mlp": 0.0, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10155126624771216, + "language_loss": 0.81542516, + "learning_rate": 0.000766185596463983, + "loss": 0.82741123, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1776, + "time_per_iteration": 2.6038215160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196202, + "balance_loss_mlp": 1.18163514, + "diversity_loss_mlp": 0.0, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.0897891274607312, + "language_loss": 0.77011722, + "learning_rate": 0.0007659218208101706, + "loss": 0.78207922, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1777, + "time_per_iteration": 3.0933022499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173425, + "balance_loss_mlp": 1.15902483, + "diversity_loss_mlp": 0.0, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.08364054831663822, + "language_loss": 0.85122472, + "learning_rate": 0.0007656579419163515, + "loss": 0.86295897, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1778, + "time_per_iteration": 2.732297420501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146549, + "balance_loss_mlp": 1.13211274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.0722191895240348, + "language_loss": 0.77409559, + "learning_rate": 0.0007653939598849724, + "loss": 0.78556108, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1779, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_mlp": 1.02253902, + "diversity_loss_mlp": 0.0, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.029240552967656448, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83912855, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 1780, + "time_per_iteration": 4.9182775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10688317, + "diversity_loss_mlp": 0.0, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07624931845389674, + "language_loss": 0.80176342, + "learning_rate": 0.000764865686819522, + "loss": 0.81297386, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1781, + "time_per_iteration": 3.0602052211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111853, + "balance_loss_mlp": 1.097965, + "diversity_loss_mlp": 0.0, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.07936344533488468, + "language_loss": 0.85836053, + "learning_rate": 0.0007646013959905449, + "loss": 0.86947906, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1782, + "time_per_iteration": 2.5750925540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_mlp": 1.09528995, + "diversity_loss_mlp": 0.0, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.07233814650781724, + "language_loss": 0.81042612, + "learning_rate": 0.0007643370024341949, + "loss": 0.82151681, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1783, + "time_per_iteration": 3.0870087146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09431553, + "diversity_loss_mlp": 0.0, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.07806584209391611, + "language_loss": 0.83175099, + "learning_rate": 0.0007640725062531195, + "loss": 0.84283221, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1784, + "time_per_iteration": 2.5063886642456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_mlp": 1.08888865, + "diversity_loss_mlp": 0.0, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.5067557182324087, + "language_loss": 0.86699629, + "learning_rate": 0.0007638079075500047, + "loss": 0.87802398, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1785, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015111, + "balance_loss_mlp": 1.00562215, + "diversity_loss_mlp": 0.0, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.016449027395748255, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76195776, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 1786, + "time_per_iteration": 4.944318056106567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.13542247, + "diversity_loss_mlp": 0.0, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.07356798682381475, + "language_loss": 0.83088338, + "learning_rate": 0.0007632784029886026, + "loss": 0.84238386, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1787, + "time_per_iteration": 2.6217002868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204344, + "balance_loss_mlp": 1.1884768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.08799574205003287, + "language_loss": 0.85466659, + "learning_rate": 0.0007630134973358873, + "loss": 0.86671007, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1788, + "time_per_iteration": 2.9664394855499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_mlp": 1.2359066, + "diversity_loss_mlp": 0.0, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.1052875761358054, + "language_loss": 0.86575854, + "learning_rate": 0.0007627484895722763, + "loss": 0.87827688, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1789, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247407, + "balance_loss_mlp": 1.23117065, + "diversity_loss_mlp": 0.0, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.09611070791328494, + "language_loss": 0.80025196, + "learning_rate": 0.0007624833798006552, + "loss": 0.81272602, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.16235352, + "routerloss_mlp": 0.0, + "step": 1790, + "time_per_iteration": 3.046809196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238128, + "balance_loss_mlp": 1.22221315, + "diversity_loss_mlp": 0.0, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.07959093752215074, + "language_loss": 0.83783114, + "learning_rate": 0.0007622181681239483, + "loss": 0.8502124, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1791, + "time_per_iteration": 2.6601433753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_mlp": 1.22793913, + "diversity_loss_mlp": 0.0, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07919089267187412, + "language_loss": 0.84668601, + "learning_rate": 0.0007619528546451202, + "loss": 0.85912943, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 1792, + "time_per_iteration": 2.782947063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208587, + "balance_loss_mlp": 1.19314909, + "diversity_loss_mlp": 0.0, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.07332959959795217, + "language_loss": 0.83832949, + "learning_rate": 0.0007616874394671745, + "loss": 0.85041535, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1793, + "time_per_iteration": 3.3206703662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.169258, + "diversity_loss_mlp": 0.0, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.0713753042238581, + "language_loss": 0.85051751, + "learning_rate": 0.0007614219226931547, + "loss": 0.86236751, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1794, + "time_per_iteration": 2.7190396785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179587, + "balance_loss_mlp": 1.16401851, + "diversity_loss_mlp": 0.0, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.07163818055438703, + "language_loss": 0.8457973, + "learning_rate": 0.0007611563044261435, + "loss": 0.85759324, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 1795, + "time_per_iteration": 2.5077741146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150042, + "balance_loss_mlp": 1.13422251, + "diversity_loss_mlp": 0.0, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0670543853763616, + "language_loss": 0.86376798, + "learning_rate": 0.0007608905847692631, + "loss": 0.8752684, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1796, + "time_per_iteration": 2.4662768840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112741, + "balance_loss_mlp": 1.11171043, + "diversity_loss_mlp": 0.0, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07671810253227593, + "language_loss": 0.86553091, + "learning_rate": 0.0007606247638256749, + "loss": 0.87680501, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 1797, + "time_per_iteration": 2.8649494647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00624206, + "balance_loss_mlp": 1.05204535, + "diversity_loss_mlp": 0.16984753, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0016633519833830733, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.78794497, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01325956, + "step": 1798, + "time_per_iteration": 4.963132619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_mlp": 1.04498482, + "diversity_loss_mlp": 0.0, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.032920799461559694, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80382872, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 1799, + "time_per_iteration": 4.773633003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_mlp": 1.08345306, + "diversity_loss_mlp": 0.0, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.10233507255995049, + "language_loss": 0.85892332, + "learning_rate": 0.0007598266943068686, + "loss": 0.86992049, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 1800, + "time_per_iteration": 2.7380948066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092311, + "balance_loss_mlp": 1.0761466, + "diversity_loss_mlp": 0.0, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.08416075255699706, + "language_loss": 0.83903629, + "learning_rate": 0.0007595604692488507, + "loss": 0.84995937, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1801, + "time_per_iteration": 2.5558300018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099836, + "balance_loss_mlp": 1.08382583, + "diversity_loss_mlp": 0.0, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.0681721192963598, + "language_loss": 0.82674247, + "learning_rate": 0.0007592941434205215, + "loss": 0.83774084, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 1802, + "time_per_iteration": 2.8181002140045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017477, + "balance_loss_mlp": 1.00651026, + "diversity_loss_mlp": 0.0, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.018274165575771096, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588537, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 1803, + "time_per_iteration": 5.063629388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126513, + "balance_loss_mlp": 1.11121821, + "diversity_loss_mlp": 0.0, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07342722091818694, + "language_loss": 0.80217302, + "learning_rate": 0.0007587611898665566, + "loss": 0.81343818, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.15270996, + "routerloss_mlp": 0.0, + "step": 1804, + "time_per_iteration": 3.0994317531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113982, + "balance_loss_mlp": 1.12468028, + "diversity_loss_mlp": 0.0, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.05936466476556785, + "language_loss": 0.82130265, + "learning_rate": 0.0007584945623478315, + "loss": 0.83270085, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1805, + "time_per_iteration": 2.833981513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152624, + "balance_loss_mlp": 1.13780582, + "diversity_loss_mlp": 0.0, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.08744691316973383, + "language_loss": 0.80801159, + "learning_rate": 0.000758227834472617, + "loss": 0.81953788, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1806, + "time_per_iteration": 3.0535178184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166216, + "balance_loss_mlp": 1.15111172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07500761638021176, + "language_loss": 0.77729452, + "learning_rate": 0.0007579610063444664, + "loss": 0.7889567, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1807, + "time_per_iteration": 2.7615864276885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149194, + "balance_loss_mlp": 1.1339947, + "diversity_loss_mlp": 0.0, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.07406875426876382, + "language_loss": 0.87547183, + "learning_rate": 0.0007576940780669712, + "loss": 0.88696373, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1808, + "time_per_iteration": 3.264080762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143539, + "balance_loss_mlp": 1.12863731, + "diversity_loss_mlp": 0.0, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07928472428244501, + "language_loss": 0.84104979, + "learning_rate": 0.0007574270497437624, + "loss": 0.85248518, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1809, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128376, + "balance_loss_mlp": 1.11302221, + "diversity_loss_mlp": 0.0, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.07150597602774303, + "language_loss": 0.88426095, + "learning_rate": 0.000757159921478509, + "loss": 0.89554477, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 1810, + "time_per_iteration": 2.7891488075256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057512, + "balance_loss_mlp": 1.04754615, + "diversity_loss_mlp": 0.0, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.03228641235871289, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75508153, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 1811, + "time_per_iteration": 4.737962007522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103488, + "balance_loss_mlp": 1.08814573, + "diversity_loss_mlp": 0.0, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.07438083858778873, + "language_loss": 0.87798911, + "learning_rate": 0.0007566253655367423, + "loss": 0.88902402, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 1812, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.07600367, + "diversity_loss_mlp": 0.0, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.06854488097647142, + "language_loss": 0.8957805, + "learning_rate": 0.000756357938067762, + "loss": 0.90669596, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 1813, + "time_per_iteration": 2.7090489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.07826209, + "diversity_loss_mlp": 0.0, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.0690606019510397, + "language_loss": 0.8334865, + "learning_rate": 0.0007560904110718033, + "loss": 0.84443069, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1814, + "time_per_iteration": 3.2445590496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096093, + "balance_loss_mlp": 1.08003569, + "diversity_loss_mlp": 0.0, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06223934742271703, + "language_loss": 0.83650601, + "learning_rate": 0.0007558227846527297, + "loss": 0.84746695, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1815, + "time_per_iteration": 2.8504550457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_mlp": 1.08731842, + "diversity_loss_mlp": 0.0, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.07831164241761415, + "language_loss": 0.83117825, + "learning_rate": 0.0007555550589144429, + "loss": 0.84221166, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1816, + "time_per_iteration": 2.4655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_mlp": 1.09515882, + "diversity_loss_mlp": 0.0, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.08460625336983617, + "language_loss": 0.84522688, + "learning_rate": 0.000755287233960883, + "loss": 0.85633731, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1817, + "time_per_iteration": 2.602492094039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.07385683, + "diversity_loss_mlp": 0.0, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.07045705340523431, + "language_loss": 0.77682364, + "learning_rate": 0.0007550193098960292, + "loss": 0.78771949, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1818, + "time_per_iteration": 2.8674800395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00989642, + "balance_loss_mlp": 1.73270237, + "diversity_loss_mlp": 0.21087486, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.029406524514427698, + "language_loss": 0.86412024, + "learning_rate": 0.0007547512868238988, + "loss": 0.87401664, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01785346, + "step": 1819, + "time_per_iteration": 3.151559829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_mlp": 1.07453036, + "diversity_loss_mlp": 0.0, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.06124546921927801, + "language_loss": 0.83503008, + "learning_rate": 0.0007544831648485473, + "loss": 0.84593564, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1820, + "time_per_iteration": 2.6791367530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094234, + "balance_loss_mlp": 1.07806909, + "diversity_loss_mlp": 0.0, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.08232155140582742, + "language_loss": 0.81448233, + "learning_rate": 0.0007542149440740694, + "loss": 0.82542467, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1821, + "time_per_iteration": 2.665632724761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.07229352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.08177047744866778, + "language_loss": 0.85514361, + "learning_rate": 0.000753946624604597, + "loss": 0.8660273, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1822, + "time_per_iteration": 2.708221673965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085159, + "balance_loss_mlp": 1.06938744, + "diversity_loss_mlp": 0.0, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.07022994660183399, + "language_loss": 0.88119262, + "learning_rate": 0.0007536782065443015, + "loss": 0.89204431, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 1823, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109141, + "balance_loss_mlp": 1.0758059, + "diversity_loss_mlp": 0.0, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.09965750131036237, + "language_loss": 0.75038946, + "learning_rate": 0.0007534096899973919, + "loss": 0.7613036, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1824, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089888, + "balance_loss_mlp": 1.07460535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.0636070515998131, + "language_loss": 0.82941401, + "learning_rate": 0.0007531410750681154, + "loss": 0.84031284, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1825, + "time_per_iteration": 2.7595911026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_mlp": 1.08562207, + "diversity_loss_mlp": 0.0, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.09267960960885083, + "language_loss": 0.87015611, + "learning_rate": 0.0007528723618607575, + "loss": 0.88115728, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1826, + "time_per_iteration": 3.4216692447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090335, + "balance_loss_mlp": 1.07524323, + "diversity_loss_mlp": 0.0, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.07214965975453298, + "language_loss": 0.82582879, + "learning_rate": 0.0007526035504796422, + "loss": 0.83673215, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.15087891, + "routerloss_mlp": 0.0, + "step": 1827, + "time_per_iteration": 2.7822000980377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094099, + "balance_loss_mlp": 1.0794003, + "diversity_loss_mlp": 0.0, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.07057247929289283, + "language_loss": 0.86824054, + "learning_rate": 0.0007523346410291312, + "loss": 0.8791815, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1828, + "time_per_iteration": 2.7560181617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098232, + "balance_loss_mlp": 1.08291376, + "diversity_loss_mlp": 0.0, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.0630617970486185, + "language_loss": 0.85159689, + "learning_rate": 0.0007520656336136245, + "loss": 0.86257917, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1829, + "time_per_iteration": 2.9432313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098974, + "balance_loss_mlp": 1.08431172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06541232162591855, + "language_loss": 0.88230217, + "learning_rate": 0.0007517965283375599, + "loss": 0.89329195, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1830, + "time_per_iteration": 2.8773486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_mlp": 1.08363926, + "diversity_loss_mlp": 0.0, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.06973135687475002, + "language_loss": 0.89511967, + "learning_rate": 0.0007515273253054132, + "loss": 0.90610522, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1831, + "time_per_iteration": 2.662757396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097986, + "balance_loss_mlp": 1.08288169, + "diversity_loss_mlp": 0.0, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.07142201858296882, + "language_loss": 0.82785273, + "learning_rate": 0.0007512580246216988, + "loss": 0.83883256, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1832, + "time_per_iteration": 2.730994939804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.08164394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.07119734441282773, + "language_loss": 0.84715027, + "learning_rate": 0.000750988626390968, + "loss": 0.85811406, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1833, + "time_per_iteration": 2.604182004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.07508624, + "diversity_loss_mlp": 0.0, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.07060575001723658, + "language_loss": 0.85089648, + "learning_rate": 0.0007507191307178108, + "loss": 0.86179501, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1834, + "time_per_iteration": 2.7584774494171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_mlp": 1.06808281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.09392412586459238, + "language_loss": 0.75105453, + "learning_rate": 0.0007504495377068543, + "loss": 0.76188982, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1835, + "time_per_iteration": 2.731039524078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087025, + "balance_loss_mlp": 1.07230306, + "diversity_loss_mlp": 0.0, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09299008065025831, + "language_loss": 0.81784093, + "learning_rate": 0.0007501798474627642, + "loss": 0.82871115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1836, + "time_per_iteration": 2.9180665016174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092183, + "balance_loss_mlp": 1.07738876, + "diversity_loss_mlp": 0.0, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.06800399913452355, + "language_loss": 0.8354817, + "learning_rate": 0.0007499100600902433, + "loss": 0.84640354, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1837, + "time_per_iteration": 2.981478452682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097597, + "balance_loss_mlp": 1.08236217, + "diversity_loss_mlp": 0.0, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.07178124654929893, + "language_loss": 0.83625698, + "learning_rate": 0.0007496401756940324, + "loss": 0.84723294, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1838, + "time_per_iteration": 2.7256877422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107267, + "balance_loss_mlp": 1.09267545, + "diversity_loss_mlp": 0.0, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.08438072522416575, + "language_loss": 0.81940264, + "learning_rate": 0.0007493701943789098, + "loss": 0.83047533, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1839, + "time_per_iteration": 2.805553674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_mlp": 1.10266685, + "diversity_loss_mlp": 0.0, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07000666511795951, + "language_loss": 0.82830888, + "learning_rate": 0.000749100116249692, + "loss": 0.83948612, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1840, + "time_per_iteration": 2.608135223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00954188, + "balance_loss_mlp": 1.66862321, + "diversity_loss_mlp": 0.20571998, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.03743173710930313, + "language_loss": 0.86076337, + "learning_rate": 0.0007488299414112321, + "loss": 0.87030524, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01701665, + "step": 1841, + "time_per_iteration": 2.6307811737060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_mlp": 1.10974133, + "diversity_loss_mlp": 0.0, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.06710116446149988, + "language_loss": 0.77204335, + "learning_rate": 0.0007485596699684215, + "loss": 0.78328466, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1842, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132611, + "balance_loss_mlp": 1.11780548, + "diversity_loss_mlp": 0.0, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.07987851383877129, + "language_loss": 0.85353696, + "learning_rate": 0.000748289302026189, + "loss": 0.86486304, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1843, + "time_per_iteration": 2.8449106216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_mlp": 1.11339569, + "diversity_loss_mlp": 0.0, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.06918658934745357, + "language_loss": 0.85752398, + "learning_rate": 0.0007480188376895004, + "loss": 0.86880362, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1844, + "time_per_iteration": 3.0339298248291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160602, + "balance_loss_mlp": 1.15135121, + "diversity_loss_mlp": 0.0, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.06421168097867443, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74971944, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 1845, + "time_per_iteration": 4.932978391647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.10506296, + "diversity_loss_mlp": 0.0, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08194467088107492, + "language_loss": 0.78768218, + "learning_rate": 0.0007474776202528074, + "loss": 0.79887938, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1846, + "time_per_iteration": 2.9188990592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111713, + "balance_loss_mlp": 1.1021452, + "diversity_loss_mlp": 0.0, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08015412782248336, + "language_loss": 0.80999184, + "learning_rate": 0.000747206867362922, + "loss": 0.82116312, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1847, + "time_per_iteration": 3.0966272354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_mlp": 1.085235, + "diversity_loss_mlp": 0.0, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.09857033029565816, + "language_loss": 0.836568, + "learning_rate": 0.0007469360184988194, + "loss": 0.84756613, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1848, + "time_per_iteration": 2.9021246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_mlp": 1.08986914, + "diversity_loss_mlp": 0.0, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08185517170087683, + "language_loss": 0.86821651, + "learning_rate": 0.0007466650737656518, + "loss": 0.8792634, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1849, + "time_per_iteration": 2.615549325942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_mlp": 1.0876888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06916390030254578, + "language_loss": 0.89687926, + "learning_rate": 0.0007463940332686098, + "loss": 0.9078998, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1850, + "time_per_iteration": 2.497159242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931214, + "balance_loss_mlp": 1.62144685, + "diversity_loss_mlp": 0.20650919, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.030410176313075864, + "language_loss": 0.84120536, + "learning_rate": 0.0007461228971129205, + "loss": 0.85051751, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01723633, + "step": 1851, + "time_per_iteration": 2.959170341491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931448, + "balance_loss_mlp": 1.62270963, + "diversity_loss_mlp": 0.20620242, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.03221270440610224, + "language_loss": 0.85523784, + "learning_rate": 0.0007458516654038483, + "loss": 0.86455238, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01699215, + "step": 1852, + "time_per_iteration": 2.6886868476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.13526964, + "diversity_loss_mlp": 0.0, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06572834298852859, + "language_loss": 0.86835778, + "learning_rate": 0.0007455803382466946, + "loss": 0.8798511, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1853, + "time_per_iteration": 2.8323659896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151398, + "balance_loss_mlp": 1.13686657, + "diversity_loss_mlp": 0.0, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.06349489422764842, + "language_loss": 0.86956179, + "learning_rate": 0.0007453089157467979, + "loss": 0.88107574, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1854, + "time_per_iteration": 2.817117929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.13687038, + "diversity_loss_mlp": 0.0, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06687597930641362, + "language_loss": 0.8221277, + "learning_rate": 0.0007450373980095341, + "loss": 0.83364242, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1855, + "time_per_iteration": 3.0857772827148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_mlp": 1.13494754, + "diversity_loss_mlp": 0.0, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.0656889709190827, + "language_loss": 0.86804116, + "learning_rate": 0.0007447657851403155, + "loss": 0.87952584, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1856, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144273, + "balance_loss_mlp": 1.1303966, + "diversity_loss_mlp": 0.0, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.08894932465162153, + "language_loss": 0.78988904, + "learning_rate": 0.0007444940772445915, + "loss": 0.80133176, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1857, + "time_per_iteration": 2.752232551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122576, + "balance_loss_mlp": 1.10860419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06705763345081875, + "language_loss": 0.80129987, + "learning_rate": 0.0007442222744278484, + "loss": 0.81252563, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1858, + "time_per_iteration": 2.638322591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_mlp": 1.09717393, + "diversity_loss_mlp": 0.0, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.05935371072747042, + "language_loss": 0.8399322, + "learning_rate": 0.0007439503767956099, + "loss": 0.85104102, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.137146, + "routerloss_mlp": 0.0, + "step": 1859, + "time_per_iteration": 2.699204921722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124434, + "balance_loss_mlp": 1.11480188, + "diversity_loss_mlp": 0.0, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.03541879327423246, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80796039, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 1860, + "time_per_iteration": 4.89499831199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089258, + "balance_loss_mlp": 1.07479787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.06413043417122823, + "language_loss": 0.86215138, + "learning_rate": 0.000743406297506922, + "loss": 0.87304389, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1861, + "time_per_iteration": 2.7184388637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919817, + "balance_loss_mlp": 1.60078692, + "diversity_loss_mlp": 0.20507258, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.028510278569739433, + "language_loss": 0.84439111, + "learning_rate": 0.0007431341160617031, + "loss": 0.8535893, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01688758, + "step": 1862, + "time_per_iteration": 2.8915610313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_mlp": 1.06988358, + "diversity_loss_mlp": 0.0, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06954606141633879, + "language_loss": 0.88100171, + "learning_rate": 0.0007428618402234491, + "loss": 0.8918457, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1863, + "time_per_iteration": 2.6724555492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087919, + "balance_loss_mlp": 1.0733279, + "diversity_loss_mlp": 0.0, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.07542508091229044, + "language_loss": 0.80288851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81376767, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1864, + "time_per_iteration": 2.724853038787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.06996608, + "diversity_loss_mlp": 0.0, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07695346444963648, + "language_loss": 0.7981261, + "learning_rate": 0.0007423170057906996, + "loss": 0.80896473, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1865, + "time_per_iteration": 3.9006779193878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108627, + "balance_loss_mlp": 1.0722512, + "diversity_loss_mlp": 0.0, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.07814080760266444, + "language_loss": 0.86228722, + "learning_rate": 0.0007420444474077275, + "loss": 0.87314993, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1866, + "time_per_iteration": 2.546194076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095193, + "balance_loss_mlp": 1.0812335, + "diversity_loss_mlp": 0.0, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.0773553058948038, + "language_loss": 0.8949936, + "learning_rate": 0.0007417717950547671, + "loss": 0.90594554, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1867, + "time_per_iteration": 2.5670700073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_mlp": 1.04262233, + "diversity_loss_mlp": 0.0, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.023944930622272237, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.770491, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 1868, + "time_per_iteration": 4.900780200958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_mlp": 1.087533, + "diversity_loss_mlp": 0.0, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.06547244306940128, + "language_loss": 0.84938717, + "learning_rate": 0.0007412262088623299, + "loss": 0.86040014, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1869, + "time_per_iteration": 2.7674195766448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092029, + "balance_loss_mlp": 1.60128522, + "diversity_loss_mlp": 0.20662443, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.03542659619783611, + "language_loss": 0.79155517, + "learning_rate": 0.0007409532752346684, + "loss": 0.80075806, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633519, + "step": 1870, + "time_per_iteration": 2.7116785049438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111485, + "balance_loss_mlp": 1.101367, + "diversity_loss_mlp": 0.0, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.061502004439029076, + "language_loss": 0.8836326, + "learning_rate": 0.0007406802480606491, + "loss": 0.89478111, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1871, + "time_per_iteration": 2.642608165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105605, + "balance_loss_mlp": 1.0916698, + "diversity_loss_mlp": 0.0, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.06939665757215846, + "language_loss": 0.90353388, + "learning_rate": 0.0007404071274462707, + "loss": 0.91458994, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.1394043, + "routerloss_mlp": 0.0, + "step": 1872, + "time_per_iteration": 2.5600955486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09967744, + "diversity_loss_mlp": 0.0, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.07241097832053987, + "language_loss": 0.83719409, + "learning_rate": 0.0007401339134975682, + "loss": 0.84832925, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1873, + "time_per_iteration": 2.6775293350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111162, + "balance_loss_mlp": 1.09724998, + "diversity_loss_mlp": 0.0, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.07980684605652169, + "language_loss": 0.84604299, + "learning_rate": 0.0007398606063206122, + "loss": 0.85715467, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1874, + "time_per_iteration": 2.6092889308929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109131, + "balance_loss_mlp": 1.09546924, + "diversity_loss_mlp": 0.0, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09304103013369584, + "language_loss": 0.78818524, + "learning_rate": 0.0007395872060215101, + "loss": 0.79927647, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1875, + "time_per_iteration": 2.5999374389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124779, + "balance_loss_mlp": 1.11121297, + "diversity_loss_mlp": 0.0, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08049441369365674, + "language_loss": 0.8851527, + "learning_rate": 0.0007393137127064056, + "loss": 0.89640045, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1876, + "time_per_iteration": 2.635896682739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_mlp": 1.11380959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.06613177233605298, + "language_loss": 0.84377646, + "learning_rate": 0.0007390401264814779, + "loss": 0.8550508, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1877, + "time_per_iteration": 2.597508192062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151319, + "balance_loss_mlp": 1.1378243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.09083655630754779, + "language_loss": 0.84454513, + "learning_rate": 0.0007387664474529427, + "loss": 0.8560583, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1878, + "time_per_iteration": 2.6493661403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143725, + "balance_loss_mlp": 1.1302073, + "diversity_loss_mlp": 0.0, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.0643860955644754, + "language_loss": 0.91379291, + "learning_rate": 0.0007384926757270518, + "loss": 0.92523015, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1879, + "time_per_iteration": 2.62565016746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152012, + "balance_loss_mlp": 1.13819528, + "diversity_loss_mlp": 0.0, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07609143241795291, + "language_loss": 0.80057949, + "learning_rate": 0.0007382188114100924, + "loss": 0.81209958, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1880, + "time_per_iteration": 2.974212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.14148784, + "diversity_loss_mlp": 0.0, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.0632350243804942, + "language_loss": 0.8182314, + "learning_rate": 0.0007379448546083884, + "loss": 0.82978803, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1881, + "time_per_iteration": 2.894099712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154364, + "balance_loss_mlp": 1.14052355, + "diversity_loss_mlp": 0.0, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06232367753538678, + "language_loss": 0.8822301, + "learning_rate": 0.0007376708054282992, + "loss": 0.89377379, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1882, + "time_per_iteration": 2.9576163291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162916, + "balance_loss_mlp": 1.14919519, + "diversity_loss_mlp": 0.0, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06608098206448941, + "language_loss": 0.83563071, + "learning_rate": 0.0007373966639762201, + "loss": 0.84725988, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1883, + "time_per_iteration": 2.6004068851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158732, + "balance_loss_mlp": 1.14478457, + "diversity_loss_mlp": 0.0, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.07441448138889938, + "language_loss": 0.88544619, + "learning_rate": 0.0007371224303585822, + "loss": 0.89703357, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1884, + "time_per_iteration": 2.5741078853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109461, + "balance_loss_mlp": 1.09897089, + "diversity_loss_mlp": 0.0, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.03545085729862102, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81466532, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 1885, + "time_per_iteration": 4.706872224807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.13442218, + "diversity_loss_mlp": 0.0, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.0691831634947964, + "language_loss": 0.8278423, + "learning_rate": 0.0007365736870525335, + "loss": 0.83932269, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1886, + "time_per_iteration": 2.8480284214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.12236464, + "diversity_loss_mlp": 0.0, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.0786816251155578, + "language_loss": 0.82659888, + "learning_rate": 0.000736299177577164, + "loss": 0.83795714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1887, + "time_per_iteration": 2.601449966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127197, + "balance_loss_mlp": 1.11358309, + "diversity_loss_mlp": 0.0, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0767010159800114, + "language_loss": 0.8381778, + "learning_rate": 0.0007360245763623174, + "loss": 0.84944975, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1888, + "time_per_iteration": 2.6951138973236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_mlp": 1.09350717, + "diversity_loss_mlp": 0.0, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06311908909694558, + "language_loss": 0.89886129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90992391, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1889, + "time_per_iteration": 2.8509137630462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094399, + "balance_loss_mlp": 1.08141732, + "diversity_loss_mlp": 0.0, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.06820711534899371, + "language_loss": 0.86674547, + "learning_rate": 0.0007354750991406684, + "loss": 0.87768942, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1890, + "time_per_iteration": 2.7162795066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_mlp": 1.07673419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07876014589837055, + "language_loss": 0.80930853, + "learning_rate": 0.0007352002233471919, + "loss": 0.82020569, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1891, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091374, + "balance_loss_mlp": 1.07835662, + "diversity_loss_mlp": 0.0, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.08103720744805817, + "language_loss": 0.79372823, + "learning_rate": 0.0007349252562408906, + "loss": 0.80464196, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1892, + "time_per_iteration": 2.6752734184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097706, + "balance_loss_mlp": 1.08496833, + "diversity_loss_mlp": 0.0, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.07356128462514616, + "language_loss": 0.81490725, + "learning_rate": 0.0007346501979285158, + "loss": 0.82588428, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1893, + "time_per_iteration": 2.8990893363952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_mlp": 1.03214884, + "diversity_loss_mlp": 0.0, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.022756463517582398, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81579787, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 1894, + "time_per_iteration": 4.8097145557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098768, + "balance_loss_mlp": 1.0857501, + "diversity_loss_mlp": 0.0, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06969655176236832, + "language_loss": 0.85880721, + "learning_rate": 0.0007340998081127308, + "loss": 0.86979485, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1895, + "time_per_iteration": 2.757380485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087705, + "balance_loss_mlp": 1.074646, + "diversity_loss_mlp": 0.0, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06910669114263218, + "language_loss": 0.91127002, + "learning_rate": 0.0007338244768230007, + "loss": 0.92214715, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1896, + "time_per_iteration": 2.7967634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098629, + "balance_loss_mlp": 1.08584976, + "diversity_loss_mlp": 0.0, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.05804787602656793, + "language_loss": 0.88684666, + "learning_rate": 0.0007335490547545578, + "loss": 0.89783299, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1897, + "time_per_iteration": 3.086498260498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_mlp": 1.08286643, + "diversity_loss_mlp": 0.0, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06953546528053214, + "language_loss": 0.82679451, + "learning_rate": 0.0007332735420143308, + "loss": 0.83774823, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1898, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097867, + "balance_loss_mlp": 1.08476591, + "diversity_loss_mlp": 0.0, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.07600656362423025, + "language_loss": 0.86647844, + "learning_rate": 0.0007329979387092826, + "loss": 0.87745708, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1899, + "time_per_iteration": 2.5437934398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_mlp": 1.08821869, + "diversity_loss_mlp": 0.0, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.05952938167480439, + "language_loss": 0.83796108, + "learning_rate": 0.0007327222449464124, + "loss": 0.8489722, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1900, + "time_per_iteration": 3.2824244499206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_mlp": 1.09499097, + "diversity_loss_mlp": 0.0, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07745224305421915, + "language_loss": 0.88634431, + "learning_rate": 0.0007324464608327538, + "loss": 0.89742231, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.12823486, + "routerloss_mlp": 0.0, + "step": 1901, + "time_per_iteration": 2.6411991119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_mlp": 1.08995461, + "diversity_loss_mlp": 0.0, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.08223816362142805, + "language_loss": 0.88474846, + "learning_rate": 0.0007321705864753758, + "loss": 0.89577842, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.1305542, + "routerloss_mlp": 0.0, + "step": 1902, + "time_per_iteration": 2.682002544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931657, + "balance_loss_mlp": 1.62497878, + "diversity_loss_mlp": 0.20707282, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.026825446902959647, + "language_loss": 0.84137708, + "learning_rate": 0.0007318946219813823, + "loss": 0.85069364, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563089, + "step": 1903, + "time_per_iteration": 3.0061404705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108165, + "balance_loss_mlp": 1.09403849, + "diversity_loss_mlp": 0.0, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.07526416733947026, + "language_loss": 0.89736164, + "learning_rate": 0.000731618567457912, + "loss": 0.90844321, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.14105225, + "routerloss_mlp": 0.0, + "step": 1904, + "time_per_iteration": 2.6523027420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099252, + "balance_loss_mlp": 1.08536446, + "diversity_loss_mlp": 0.0, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07605082206895837, + "language_loss": 0.87058568, + "learning_rate": 0.000731342423012139, + "loss": 0.88157821, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1905, + "time_per_iteration": 3.0595312118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_mlp": 1.08213234, + "diversity_loss_mlp": 0.0, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07718853495225737, + "language_loss": 0.82559443, + "learning_rate": 0.0007310661887512722, + "loss": 0.83655763, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1906, + "time_per_iteration": 3.056859016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090478, + "balance_loss_mlp": 1.07672131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.07458396044121823, + "language_loss": 0.8194133, + "learning_rate": 0.0007307898647825549, + "loss": 0.83031803, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1907, + "time_per_iteration": 2.670468807220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_mlp": 1.07666349, + "diversity_loss_mlp": 0.0, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09231339543244264, + "language_loss": 0.89368939, + "learning_rate": 0.0007305134512132659, + "loss": 0.90459347, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.13751221, + "routerloss_mlp": 0.0, + "step": 1908, + "time_per_iteration": 2.6561663150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091843, + "balance_loss_mlp": 1.07826495, + "diversity_loss_mlp": 0.0, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.08913139219920335, + "language_loss": 0.83308864, + "learning_rate": 0.0007302369481507183, + "loss": 0.84400707, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1909, + "time_per_iteration": 2.5485799312591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017138, + "balance_loss_mlp": 1.00979447, + "diversity_loss_mlp": 0.0, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.013277678950868657, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.80978894, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1910, + "time_per_iteration": 4.848855257034302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111697, + "balance_loss_mlp": 1.09842944, + "diversity_loss_mlp": 0.0, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.058739485749840115, + "language_loss": 0.85315347, + "learning_rate": 0.000729683673975274, + "loss": 0.86427045, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.13287354, + "routerloss_mlp": 0.0, + "step": 1911, + "time_per_iteration": 2.690218210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114863, + "balance_loss_mlp": 1.10165429, + "diversity_loss_mlp": 0.0, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05585809452393386, + "language_loss": 0.8291769, + "learning_rate": 0.0007294069030771774, + "loss": 0.84032547, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1912, + "time_per_iteration": 3.678927183151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125561, + "balance_loss_mlp": 1.1124301, + "diversity_loss_mlp": 0.0, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.06389765233013874, + "language_loss": 0.90667701, + "learning_rate": 0.0007291300431154224, + "loss": 0.91793263, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1913, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.03611672, + "diversity_loss_mlp": 0.0, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.02051984405011318, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7143358, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1914, + "time_per_iteration": 4.973980903625488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137116, + "balance_loss_mlp": 1.12441444, + "diversity_loss_mlp": 0.0, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.0814243559806059, + "language_loss": 0.7981922, + "learning_rate": 0.0007285760564309179, + "loss": 0.8095634, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.1270752, + "routerloss_mlp": 0.0, + "step": 1915, + "time_per_iteration": 3.091447353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127931, + "balance_loss_mlp": 1.11485386, + "diversity_loss_mlp": 0.0, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.09574055809111115, + "language_loss": 0.84848046, + "learning_rate": 0.0007282989299232448, + "loss": 0.85975981, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.13092041, + "routerloss_mlp": 0.0, + "step": 1916, + "time_per_iteration": 3.074547052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113017, + "balance_loss_mlp": 1.09977341, + "diversity_loss_mlp": 0.0, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.08763204320127825, + "language_loss": 0.83209801, + "learning_rate": 0.0007280217147820668, + "loss": 0.84322822, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1917, + "time_per_iteration": 2.6260228157043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092055, + "balance_loss_mlp": 1.07888198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06316346716689762, + "language_loss": 0.79465461, + "learning_rate": 0.0007277444111150079, + "loss": 0.80557513, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.13189697, + "routerloss_mlp": 0.0, + "step": 1918, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_mlp": 1.07465601, + "diversity_loss_mlp": 0.0, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.09595367080188737, + "language_loss": 0.84512901, + "learning_rate": 0.0007274670190297272, + "loss": 0.85601443, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1919, + "time_per_iteration": 2.590839147567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_mlp": 1.07205224, + "diversity_loss_mlp": 0.0, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.07431087712553297, + "language_loss": 0.82079387, + "learning_rate": 0.0007271895386339179, + "loss": 0.83165228, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1920, + "time_per_iteration": 2.7924282550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094639, + "balance_loss_mlp": 1.08048892, + "diversity_loss_mlp": 0.0, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.07797312778631413, + "language_loss": 0.83431751, + "learning_rate": 0.0007269119700353073, + "loss": 0.84526384, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1921, + "time_per_iteration": 2.7155139446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_mlp": 1.0987196, + "diversity_loss_mlp": 0.0, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.07250682713227712, + "language_loss": 0.84994757, + "learning_rate": 0.0007266343133416571, + "loss": 0.86107111, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1922, + "time_per_iteration": 2.7394983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.06564641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.035523530201468645, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78190196, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.07617188, + "routerloss_mlp": 0.0, + "step": 1923, + "time_per_iteration": 4.877161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10153794, + "diversity_loss_mlp": 0.0, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.0789330271899564, + "language_loss": 0.84356588, + "learning_rate": 0.0007260787361004556, + "loss": 0.85471952, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1924, + "time_per_iteration": 2.608745813369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_mlp": 1.02985299, + "diversity_loss_mlp": 0.0, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.021371165562314075, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74798417, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 1925, + "time_per_iteration": 4.906585931777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114233, + "balance_loss_mlp": 1.10069048, + "diversity_loss_mlp": 0.0, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.12026638393290963, + "language_loss": 0.87422252, + "learning_rate": 0.0007255228077730903, + "loss": 0.88536477, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1926, + "time_per_iteration": 2.6886680126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_mlp": 1.11107421, + "diversity_loss_mlp": 0.0, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06719853297068734, + "language_loss": 0.81722987, + "learning_rate": 0.0007252447122218632, + "loss": 0.82846814, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1927, + "time_per_iteration": 3.1511058807373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_mlp": 1.11258984, + "diversity_loss_mlp": 0.0, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.08764579691953547, + "language_loss": 0.87849444, + "learning_rate": 0.0007249665292228834, + "loss": 0.88974959, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1928, + "time_per_iteration": 2.565991163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120289, + "balance_loss_mlp": 1.1073308, + "diversity_loss_mlp": 0.0, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.0633685198143462, + "language_loss": 0.83318496, + "learning_rate": 0.000724688258884151, + "loss": 0.84438789, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1929, + "time_per_iteration": 2.531827926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.10286927, + "diversity_loss_mlp": 0.0, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.05744658583323744, + "language_loss": 0.86564112, + "learning_rate": 0.0007244099013137002, + "loss": 0.8767941, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1930, + "time_per_iteration": 3.1130166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.10404849, + "diversity_loss_mlp": 0.0, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.06880018611034966, + "language_loss": 0.88695574, + "learning_rate": 0.0007241314566195993, + "loss": 0.89812243, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.1262207, + "routerloss_mlp": 0.0, + "step": 1931, + "time_per_iteration": 3.374743700027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110097, + "balance_loss_mlp": 1.08821416, + "diversity_loss_mlp": 0.0, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.06303779661636588, + "language_loss": 0.85510373, + "learning_rate": 0.0007238529249099496, + "loss": 0.86611342, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1932, + "time_per_iteration": 2.6654059886932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_mlp": 1.0911988, + "diversity_loss_mlp": 0.0, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03412398452916775, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78954613, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 1933, + "time_per_iteration": 4.851354598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091175, + "balance_loss_mlp": 1.07859278, + "diversity_loss_mlp": 0.0, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.08014253307267598, + "language_loss": 0.80636895, + "learning_rate": 0.000723295600876581, + "loss": 0.81728071, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 1934, + "time_per_iteration": 3.0025534629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097044, + "balance_loss_mlp": 1.08416963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.08698689907724866, + "language_loss": 0.88006312, + "learning_rate": 0.0007230168087692344, + "loss": 0.89103359, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.12872314, + "routerloss_mlp": 0.0, + "step": 1935, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095813, + "balance_loss_mlp": 1.0830214, + "diversity_loss_mlp": 0.0, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07031074193849007, + "language_loss": 0.82382512, + "learning_rate": 0.0007227379300790839, + "loss": 0.8347832, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1936, + "time_per_iteration": 3.0040676593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092601, + "balance_loss_mlp": 1.07969058, + "diversity_loss_mlp": 0.0, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.07132774808829288, + "language_loss": 0.85478282, + "learning_rate": 0.0007224589649143997, + "loss": 0.86570889, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 1937, + "time_per_iteration": 2.584545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089825, + "balance_loss_mlp": 1.07662272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.0711139803163438, + "language_loss": 0.8120302, + "learning_rate": 0.0007221799133834861, + "loss": 0.82292843, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.13214111, + "routerloss_mlp": 0.0, + "step": 1938, + "time_per_iteration": 2.6393649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109955, + "balance_loss_mlp": 1.08649623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20460237815205612, + "language_loss": 0.81793052, + "learning_rate": 0.00072190077559468, + "loss": 0.82892597, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1939, + "time_per_iteration": 2.5494682788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127139, + "balance_loss_mlp": 1.1140976, + "diversity_loss_mlp": 0.0, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.05817015695703163, + "language_loss": 0.89248812, + "learning_rate": 0.0007216215516563527, + "loss": 0.90375948, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.13049316, + "routerloss_mlp": 0.0, + "step": 1940, + "time_per_iteration": 2.6755452156066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129035, + "balance_loss_mlp": 1.1159811, + "diversity_loss_mlp": 0.0, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.07778932214282369, + "language_loss": 0.83852386, + "learning_rate": 0.0007213422416769083, + "loss": 0.84981418, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1941, + "time_per_iteration": 2.6008002758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_mlp": 1.12319708, + "diversity_loss_mlp": 0.0, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.06345716224902766, + "language_loss": 0.7501297, + "learning_rate": 0.0007210628457647849, + "loss": 0.76148963, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1942, + "time_per_iteration": 2.5911362171173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_mlp": 1.12763917, + "diversity_loss_mlp": 0.0, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.06753886702103719, + "language_loss": 0.78585184, + "learning_rate": 0.000720783364028453, + "loss": 0.7972604, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.13238525, + "routerloss_mlp": 0.0, + "step": 1943, + "time_per_iteration": 2.7490458488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149977, + "balance_loss_mlp": 1.13685822, + "diversity_loss_mlp": 0.0, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.0650742437261564, + "language_loss": 0.87667847, + "learning_rate": 0.0007205037965764177, + "loss": 0.88817823, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1944, + "time_per_iteration": 2.5870554447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_mlp": 1.12192512, + "diversity_loss_mlp": 0.0, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07468357539719116, + "language_loss": 0.85650361, + "learning_rate": 0.0007202241435172161, + "loss": 0.86785173, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1945, + "time_per_iteration": 2.7550253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131811, + "balance_loss_mlp": 1.11901414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07270487210957549, + "language_loss": 0.87884831, + "learning_rate": 0.0007199444049594198, + "loss": 0.8901664, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1946, + "time_per_iteration": 2.9499337673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111095, + "balance_loss_mlp": 1.09783912, + "diversity_loss_mlp": 0.0, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.07247382516020226, + "language_loss": 0.83384776, + "learning_rate": 0.0007196645810116322, + "loss": 0.84495866, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1947, + "time_per_iteration": 2.70394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113218, + "balance_loss_mlp": 1.1003499, + "diversity_loss_mlp": 0.0, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07522309633784076, + "language_loss": 0.84431696, + "learning_rate": 0.0007193846717824912, + "loss": 0.8554492, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1948, + "time_per_iteration": 2.923752546310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116002, + "balance_loss_mlp": 1.10312748, + "diversity_loss_mlp": 0.0, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.06883561802065806, + "language_loss": 0.88268626, + "learning_rate": 0.0007191046773806669, + "loss": 0.89384627, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1949, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108921, + "balance_loss_mlp": 1.09593272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07969110082801287, + "language_loss": 0.83211446, + "learning_rate": 0.0007188245979148631, + "loss": 0.84320366, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1950, + "time_per_iteration": 3.193124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_mlp": 1.09892154, + "diversity_loss_mlp": 0.0, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.07005872092850987, + "language_loss": 0.87434363, + "learning_rate": 0.0007185444334938157, + "loss": 0.88546085, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1951, + "time_per_iteration": 2.669201135635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_mlp": 1.0892663, + "diversity_loss_mlp": 0.0, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.08195801919923047, + "language_loss": 0.85047525, + "learning_rate": 0.0007182641842262947, + "loss": 0.86149311, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 1952, + "time_per_iteration": 2.602139472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092197, + "balance_loss_mlp": 1.07936394, + "diversity_loss_mlp": 0.0, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.07349771430020792, + "language_loss": 0.77754879, + "learning_rate": 0.0007179838502211022, + "loss": 0.78847075, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.128479, + "routerloss_mlp": 0.0, + "step": 1953, + "time_per_iteration": 2.85720157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_mlp": 1.08148086, + "diversity_loss_mlp": 0.0, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0681681729591206, + "language_loss": 0.86330736, + "learning_rate": 0.0007177034315870738, + "loss": 0.87425238, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1954, + "time_per_iteration": 2.958862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_mlp": 1.08803654, + "diversity_loss_mlp": 0.0, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06642365438263753, + "language_loss": 0.90809441, + "learning_rate": 0.0007174229284330773, + "loss": 0.91910505, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1955, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_mlp": 1.07642531, + "diversity_loss_mlp": 0.0, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.07788827503332588, + "language_loss": 0.86705017, + "learning_rate": 0.0007171423408680141, + "loss": 0.87794375, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1956, + "time_per_iteration": 2.8101606369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00950311, + "balance_loss_mlp": 1.6602329, + "diversity_loss_mlp": 0.20739825, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.03218717292019043, + "language_loss": 0.89567441, + "learning_rate": 0.0007168616690008176, + "loss": 0.90517747, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01649548, + "step": 1957, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081569, + "balance_loss_mlp": 1.06840825, + "diversity_loss_mlp": 0.0, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.07242251254882147, + "language_loss": 0.85681045, + "learning_rate": 0.0007165809129404545, + "loss": 0.86762613, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1958, + "time_per_iteration": 2.8396048545837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_mlp": 1.07657433, + "diversity_loss_mlp": 0.0, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08227545286248691, + "language_loss": 0.86212921, + "learning_rate": 0.0007163000727959239, + "loss": 0.87303019, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1959, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087148, + "balance_loss_mlp": 1.07989979, + "diversity_loss_mlp": 0.0, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.05215322395932221, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79046214, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 1960, + "time_per_iteration": 4.869986057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095327, + "balance_loss_mlp": 1.08232689, + "diversity_loss_mlp": 0.0, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.08048811275026858, + "language_loss": 0.84568793, + "learning_rate": 0.00071573814069052, + "loss": 0.85664117, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.13018799, + "routerloss_mlp": 0.0, + "step": 1961, + "time_per_iteration": 2.9122819900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109901, + "balance_loss_mlp": 1.08614171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.06061063893945359, + "language_loss": 0.88073885, + "learning_rate": 0.0007154570489478081, + "loss": 0.89172894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1962, + "time_per_iteration": 3.1824018955230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111399, + "balance_loss_mlp": 1.10154414, + "diversity_loss_mlp": 0.0, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.06274200702745775, + "language_loss": 0.86391222, + "learning_rate": 0.0007151758735572514, + "loss": 0.87505209, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 1963, + "time_per_iteration": 2.997624158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_mlp": 1.09836888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.07983075782925624, + "language_loss": 0.80894458, + "learning_rate": 0.0007148946146280119, + "loss": 0.82005548, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 1964, + "time_per_iteration": 2.836583137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00620122, + "balance_loss_mlp": 1.05382681, + "diversity_loss_mlp": 0.16216688, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.0017779517528101797, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.72812271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01212509, + "step": 1965, + "time_per_iteration": 4.906678915023804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_mlp": 1.02436352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.025755206304302582, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.7637251, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 1966, + "time_per_iteration": 4.93319296836853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127949, + "balance_loss_mlp": 1.11581361, + "diversity_loss_mlp": 0.0, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.05898800907157556, + "language_loss": 0.83873129, + "learning_rate": 0.0007140503377003022, + "loss": 0.85001081, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 1967, + "time_per_iteration": 2.9807000160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123739, + "balance_loss_mlp": 1.11125755, + "diversity_loss_mlp": 0.0, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.06421364750503517, + "language_loss": 0.84625173, + "learning_rate": 0.000713768745708599, + "loss": 0.85748911, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1968, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_mlp": 1.10671234, + "diversity_loss_mlp": 0.0, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.06880095080762995, + "language_loss": 0.77052647, + "learning_rate": 0.0007134870707245085, + "loss": 0.78171611, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.12249756, + "routerloss_mlp": 0.0, + "step": 1969, + "time_per_iteration": 3.302985429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_mlp": 1.10852587, + "diversity_loss_mlp": 0.0, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.07142024228833302, + "language_loss": 0.84469545, + "learning_rate": 0.0007132053128573864, + "loss": 0.85590458, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.12384033, + "routerloss_mlp": 0.0, + "step": 1970, + "time_per_iteration": 2.7751197814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.11231327, + "diversity_loss_mlp": 0.0, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06795721743578591, + "language_loss": 0.83786452, + "learning_rate": 0.0007129234722166211, + "loss": 0.84910882, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 1971, + "time_per_iteration": 2.806898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114684, + "balance_loss_mlp": 1.10238707, + "diversity_loss_mlp": 0.0, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.06601167392952549, + "language_loss": 0.91087604, + "learning_rate": 0.0007126415489116328, + "loss": 0.92202282, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.1229248, + "routerloss_mlp": 0.0, + "step": 1972, + "time_per_iteration": 2.656651496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.09782279, + "diversity_loss_mlp": 0.0, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06641244535011205, + "language_loss": 0.81145501, + "learning_rate": 0.0007123595430518736, + "loss": 0.82255375, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 1973, + "time_per_iteration": 2.8665072917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_mlp": 1.09068835, + "diversity_loss_mlp": 0.0, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.07235703206146665, + "language_loss": 0.86411089, + "learning_rate": 0.0007120774547468282, + "loss": 0.87513655, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 1974, + "time_per_iteration": 2.5590381622314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948323, + "balance_loss_mlp": 1.65707994, + "diversity_loss_mlp": 0.20756721, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.03148003592885531, + "language_loss": 0.81558585, + "learning_rate": 0.0007117952841060128, + "loss": 0.82506907, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01599924, + "step": 1975, + "time_per_iteration": 2.6777563095092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_mlp": 1.07167053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.07660828670939425, + "language_loss": 0.83672053, + "learning_rate": 0.0007115130312389756, + "loss": 0.8475588, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 1976, + "time_per_iteration": 2.7103323936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07200503, + "diversity_loss_mlp": 0.0, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.08353002189035653, + "language_loss": 0.79290646, + "learning_rate": 0.0007112306962552973, + "loss": 0.80375111, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.12463379, + "routerloss_mlp": 0.0, + "step": 1977, + "time_per_iteration": 2.576239824295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_mlp": 1.07254314, + "diversity_loss_mlp": 0.0, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.06483406604645132, + "language_loss": 0.85315859, + "learning_rate": 0.0007109482792645896, + "loss": 0.86400628, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 1978, + "time_per_iteration": 2.7146143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.07276165, + "diversity_loss_mlp": 0.0, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.06865418790878511, + "language_loss": 0.83831733, + "learning_rate": 0.0007106657803764969, + "loss": 0.84916663, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 1979, + "time_per_iteration": 2.73152494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086146, + "balance_loss_mlp": 1.07395101, + "diversity_loss_mlp": 0.0, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07620298141647525, + "language_loss": 0.81962979, + "learning_rate": 0.0007103831997006948, + "loss": 0.83049119, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 1980, + "time_per_iteration": 2.7383615970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_mlp": 1.08276772, + "diversity_loss_mlp": 0.0, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.0842263164190672, + "language_loss": 0.85342598, + "learning_rate": 0.0007101005373468908, + "loss": 0.86437213, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 1981, + "time_per_iteration": 2.889251708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097444, + "balance_loss_mlp": 1.08543372, + "diversity_loss_mlp": 0.0, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.06048237516575629, + "language_loss": 0.86649287, + "learning_rate": 0.0007098177934248242, + "loss": 0.87746727, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 1982, + "time_per_iteration": 2.773146867752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920145, + "balance_loss_mlp": 1.60273147, + "diversity_loss_mlp": 0.20649332, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.033525346661278974, + "language_loss": 0.85516387, + "learning_rate": 0.0007095349680442661, + "loss": 0.86436534, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01553278, + "step": 1983, + "time_per_iteration": 2.8675785064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116298, + "balance_loss_mlp": 1.1045742, + "diversity_loss_mlp": 0.0, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.06407324010727367, + "language_loss": 0.78783178, + "learning_rate": 0.0007092520613150188, + "loss": 0.79899484, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 1984, + "time_per_iteration": 2.709177017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918651, + "balance_loss_mlp": 1.59999418, + "diversity_loss_mlp": 0.20665541, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.03070680845617011, + "language_loss": 0.80925471, + "learning_rate": 0.0007089690733469165, + "loss": 0.81844121, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532666, + "step": 1985, + "time_per_iteration": 2.750558376312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_mlp": 1.12384343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.08571071539105668, + "language_loss": 0.82313848, + "learning_rate": 0.000708686004249825, + "loss": 0.83449578, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 1986, + "time_per_iteration": 2.7550368309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.12012124, + "diversity_loss_mlp": 0.0, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07744479108461458, + "language_loss": 0.91340905, + "learning_rate": 0.0007084028541336413, + "loss": 0.92473006, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 1987, + "time_per_iteration": 2.703339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914957, + "balance_loss_mlp": 1.59260678, + "diversity_loss_mlp": 0.20690078, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.03035395776464378, + "language_loss": 0.86267084, + "learning_rate": 0.0007081196231082942, + "loss": 0.87182039, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01520337, + "step": 1988, + "time_per_iteration": 2.8075153827667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.10567343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.07746710731409655, + "language_loss": 0.80053389, + "learning_rate": 0.0007078363112837436, + "loss": 0.81171107, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.12036133, + "routerloss_mlp": 0.0, + "step": 1989, + "time_per_iteration": 2.811197280883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.09261441, + "diversity_loss_mlp": 0.0, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.07961201652041947, + "language_loss": 0.84721339, + "learning_rate": 0.000707552918769981, + "loss": 0.85826218, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 1990, + "time_per_iteration": 2.4908246994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_mlp": 1.08987188, + "diversity_loss_mlp": 0.0, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.06284554422997896, + "language_loss": 0.83619118, + "learning_rate": 0.000707269445677029, + "loss": 0.84721333, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.12341309, + "routerloss_mlp": 0.0, + "step": 1991, + "time_per_iteration": 2.733126401901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101588, + "balance_loss_mlp": 1.08921361, + "diversity_loss_mlp": 0.0, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07203164936975576, + "language_loss": 0.85140717, + "learning_rate": 0.0007069858921149416, + "loss": 0.86242306, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 1992, + "time_per_iteration": 2.9382007122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_mlp": 1.08434701, + "diversity_loss_mlp": 0.0, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.05485930037569587, + "language_loss": 0.85794246, + "learning_rate": 0.0007067022581938043, + "loss": 0.86891043, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.12457275, + "routerloss_mlp": 0.0, + "step": 1993, + "time_per_iteration": 2.857525110244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_mlp": 1.08321714, + "diversity_loss_mlp": 0.0, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.0871408980162776, + "language_loss": 0.83722532, + "learning_rate": 0.0007064185440237334, + "loss": 0.8481794, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1994, + "time_per_iteration": 2.7131123542785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099231, + "balance_loss_mlp": 1.08733368, + "diversity_loss_mlp": 0.0, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.06357294591464056, + "language_loss": 0.84358412, + "learning_rate": 0.0007061347497148764, + "loss": 0.85457647, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.11895752, + "routerloss_mlp": 0.0, + "step": 1995, + "time_per_iteration": 2.7398569583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_mlp": 1.09015, + "diversity_loss_mlp": 0.0, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.07322887134464046, + "language_loss": 0.86299884, + "learning_rate": 0.0007058508753774122, + "loss": 0.87402225, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1996, + "time_per_iteration": 2.6903162002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108867, + "balance_loss_mlp": 1.09709477, + "diversity_loss_mlp": 0.0, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.0698381422429368, + "language_loss": 0.86921895, + "learning_rate": 0.0007055669211215505, + "loss": 0.88030767, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 1997, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113141, + "balance_loss_mlp": 1.10084486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.08585182349688475, + "language_loss": 0.77776283, + "learning_rate": 0.0007052828870575322, + "loss": 0.78889418, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 1998, + "time_per_iteration": 2.685685873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_mlp": 1.11406291, + "diversity_loss_mlp": 0.0, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06979871165732322, + "language_loss": 0.87060714, + "learning_rate": 0.0007049987732956291, + "loss": 0.8818661, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.11834717, + "routerloss_mlp": 0.0, + "step": 1999, + "time_per_iteration": 2.9710631370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110647, + "balance_loss_mlp": 1.09428668, + "diversity_loss_mlp": 0.0, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05561177596637214, + "language_loss": 0.82812738, + "learning_rate": 0.0007047145799461439, + "loss": 0.83919203, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2000, + "time_per_iteration": 2.8492860794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_mlp": 1.09293747, + "diversity_loss_mlp": 0.0, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06017266002852966, + "language_loss": 0.82272708, + "learning_rate": 0.00070443030711941, + "loss": 0.83377922, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2001, + "time_per_iteration": 2.769383430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_mlp": 1.08806002, + "diversity_loss_mlp": 0.0, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.061888534691205976, + "language_loss": 0.82098496, + "learning_rate": 0.0007041459549257924, + "loss": 0.83198571, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2002, + "time_per_iteration": 2.876244306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089803, + "balance_loss_mlp": 1.07744145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.06816771124006925, + "language_loss": 0.78024125, + "learning_rate": 0.0007038615234756859, + "loss": 0.79113925, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.12359619, + "routerloss_mlp": 0.0, + "step": 2003, + "time_per_iteration": 3.1744768619537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_mlp": 1.07477546, + "diversity_loss_mlp": 0.0, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.09233530116269285, + "language_loss": 0.83808231, + "learning_rate": 0.000703577012879517, + "loss": 0.84895122, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2004, + "time_per_iteration": 2.633391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089004, + "balance_loss_mlp": 1.07705307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07105955558417659, + "language_loss": 0.88946962, + "learning_rate": 0.0007032924232477423, + "loss": 0.90035963, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2005, + "time_per_iteration": 2.6482574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109168, + "balance_loss_mlp": 1.0797528, + "diversity_loss_mlp": 0.0, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.07024694433071269, + "language_loss": 0.80605727, + "learning_rate": 0.0007030077546908493, + "loss": 0.81697416, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2006, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.08056581, + "diversity_loss_mlp": 0.0, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.032453276732354666, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84151709, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.06494141, + "routerloss_mlp": 0.0, + "step": 2007, + "time_per_iteration": 4.798014402389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099136, + "balance_loss_mlp": 1.08744717, + "diversity_loss_mlp": 0.0, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.08661380313869275, + "language_loss": 0.79137146, + "learning_rate": 0.0007024381812438117, + "loss": 0.8023628, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2008, + "time_per_iteration": 2.5403189659118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_mlp": 1.08864713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09407170185597404, + "language_loss": 0.83448064, + "learning_rate": 0.0007021532765747951, + "loss": 0.8454901, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2009, + "time_per_iteration": 2.9585187435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094144, + "balance_loss_mlp": 1.08211613, + "diversity_loss_mlp": 0.0, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.0684890586406507, + "language_loss": 0.79048979, + "learning_rate": 0.0007018682934229162, + "loss": 0.80143124, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2010, + "time_per_iteration": 2.9703307151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_mlp": 1.0842756, + "diversity_loss_mlp": 0.0, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06303649013837292, + "language_loss": 0.82761061, + "learning_rate": 0.0007015832318988152, + "loss": 0.83857542, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2011, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_mlp": 1.02231336, + "diversity_loss_mlp": 0.0, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.017766506591404385, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.7491802, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 2012, + "time_per_iteration": 4.938155651092529 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_mlp": 1.07810068, + "diversity_loss_mlp": 0.0, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.060967443696148906, + "language_loss": 0.84265292, + "learning_rate": 0.0007010128741766604, + "loss": 0.85356176, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 2013, + "time_per_iteration": 2.7293431758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091499, + "balance_loss_mlp": 1.07861209, + "diversity_loss_mlp": 0.0, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.07873148114105366, + "language_loss": 0.84277219, + "learning_rate": 0.0007007275782000391, + "loss": 0.85368717, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 2014, + "time_per_iteration": 2.644911766052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091625, + "balance_loss_mlp": 1.07889354, + "diversity_loss_mlp": 0.0, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.0868083489465314, + "language_loss": 0.8502394, + "learning_rate": 0.0007004422042940605, + "loss": 0.86115563, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2015, + "time_per_iteration": 2.5096747875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109894, + "balance_loss_mlp": 1.08593392, + "diversity_loss_mlp": 0.0, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08227522563153689, + "language_loss": 0.89877218, + "learning_rate": 0.0007001567525695169, + "loss": 0.90976155, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 2016, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06437704205290017, + "language_loss": 0.83705699, + "learning_rate": 0.0006998712231372303, + "loss": 0.84811676, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2017, + "time_per_iteration": 3.016061305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119415, + "balance_loss_mlp": 1.10692167, + "diversity_loss_mlp": 0.0, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06622760195410109, + "language_loss": 0.85886908, + "learning_rate": 0.0006995856161080532, + "loss": 0.87006325, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.12487793, + "routerloss_mlp": 0.0, + "step": 2018, + "time_per_iteration": 2.8263893127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_mlp": 1.11165869, + "diversity_loss_mlp": 0.0, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.06957079313074316, + "language_loss": 0.82328916, + "learning_rate": 0.0006992999315928679, + "loss": 0.83453172, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.1260376, + "routerloss_mlp": 0.0, + "step": 2019, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.11772799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.05589846380959986, + "language_loss": 0.85480869, + "learning_rate": 0.0006990141697025871, + "loss": 0.86611497, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 2020, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067569, + "balance_loss_mlp": 1.06141829, + "diversity_loss_mlp": 0.0, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.034323999481440985, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77427208, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2021, + "time_per_iteration": 4.782108545303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130924, + "balance_loss_mlp": 1.11879468, + "diversity_loss_mlp": 0.0, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0813348018947899, + "language_loss": 0.82333553, + "learning_rate": 0.0006984424142405392, + "loss": 0.83464473, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2022, + "time_per_iteration": 2.804866075515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.10578668, + "diversity_loss_mlp": 0.0, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.07379903296161248, + "language_loss": 0.82117045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83235097, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2023, + "time_per_iteration": 2.5883662700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_mlp": 1.11855519, + "diversity_loss_mlp": 0.0, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.07869766022149485, + "language_loss": 0.8995713, + "learning_rate": 0.0006978703506098102, + "loss": 0.91087961, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2024, + "time_per_iteration": 2.730283498764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127449, + "balance_loss_mlp": 1.11556411, + "diversity_loss_mlp": 0.0, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.0665173530375796, + "language_loss": 0.88210815, + "learning_rate": 0.00069758420350879, + "loss": 0.89338267, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2025, + "time_per_iteration": 2.62969708442688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932402, + "balance_loss_mlp": 1.62686133, + "diversity_loss_mlp": 0.20693868, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.03379762859523427, + "language_loss": 0.8613863, + "learning_rate": 0.000697297979698779, + "loss": 0.87071025, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01550185, + "step": 2026, + "time_per_iteration": 2.837543249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_mlp": 1.09529877, + "diversity_loss_mlp": 0.0, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06049708379655892, + "language_loss": 0.83660531, + "learning_rate": 0.0006970116792908992, + "loss": 0.84767604, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2027, + "time_per_iteration": 3.1133604049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_mlp": 1.0960542, + "diversity_loss_mlp": 0.0, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.07190738956644391, + "language_loss": 0.81380564, + "learning_rate": 0.000696725302396302, + "loss": 0.82488191, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2028, + "time_per_iteration": 2.6460230350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109766, + "balance_loss_mlp": 1.08604932, + "diversity_loss_mlp": 0.0, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.06814290150602269, + "language_loss": 0.85887402, + "learning_rate": 0.0006964388491261692, + "loss": 0.86985064, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2029, + "time_per_iteration": 3.296208143234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099195, + "balance_loss_mlp": 1.0871129, + "diversity_loss_mlp": 0.0, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.075812953715104, + "language_loss": 0.87511015, + "learning_rate": 0.0006961523195917114, + "loss": 0.88610214, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2030, + "time_per_iteration": 2.803239345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_mlp": 1.09573865, + "diversity_loss_mlp": 0.0, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.0665807006884719, + "language_loss": 0.78137511, + "learning_rate": 0.0006958657139041696, + "loss": 0.79245031, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2031, + "time_per_iteration": 2.739151954650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_mlp": 1.05531955, + "diversity_loss_mlp": 0.0, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.035996309550900246, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77773988, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.05688477, + "routerloss_mlp": 0.0, + "step": 2032, + "time_per_iteration": 4.918209552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_mlp": 1.08307993, + "diversity_loss_mlp": 0.0, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.0751880944680772, + "language_loss": 0.78643966, + "learning_rate": 0.0006952922745149434, + "loss": 0.79738843, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2033, + "time_per_iteration": 2.6274161338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091358, + "balance_loss_mlp": 1.07940745, + "diversity_loss_mlp": 0.0, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.07391479182011068, + "language_loss": 0.87674987, + "learning_rate": 0.000695005441035888, + "loss": 0.88766348, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2034, + "time_per_iteration": 2.647348642349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018577, + "balance_loss_mlp": 1.01280713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.010435626825017296, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74742007, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2035, + "time_per_iteration": 4.8861188888549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107172, + "balance_loss_mlp": 1.094733, + "diversity_loss_mlp": 0.0, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06114898183694146, + "language_loss": 0.81133932, + "learning_rate": 0.0006944315470656863, + "loss": 0.82241106, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.12438965, + "routerloss_mlp": 0.0, + "step": 2036, + "time_per_iteration": 3.0057246685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_mlp": 1.09606266, + "diversity_loss_mlp": 0.0, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.0812142536963638, + "language_loss": 0.90953541, + "learning_rate": 0.000694144486797345, + "loss": 0.92062169, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2037, + "time_per_iteration": 2.6566872596740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_mlp": 1.0060699, + "diversity_loss_mlp": 0.0, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.012879447335335118, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80532491, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2038, + "time_per_iteration": 4.609802722930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.09141517, + "diversity_loss_mlp": 0.0, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.07718413790316761, + "language_loss": 0.89271998, + "learning_rate": 0.0006935701402514156, + "loss": 0.90375727, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2039, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101805, + "balance_loss_mlp": 1.01206541, + "diversity_loss_mlp": 0.0, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.016017309503016164, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74052942, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2040, + "time_per_iteration": 4.954579830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_mlp": 1.09434199, + "diversity_loss_mlp": 0.0, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.0728619475730698, + "language_loss": 0.84539711, + "learning_rate": 0.0006929954931031422, + "loss": 0.85646391, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2041, + "time_per_iteration": 3.6979990005493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114297, + "balance_loss_mlp": 1.10201287, + "diversity_loss_mlp": 0.0, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.07303574322286652, + "language_loss": 0.88330269, + "learning_rate": 0.0006927080570819805, + "loss": 0.89444566, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2042, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_mlp": 1.11437607, + "diversity_loss_mlp": 0.0, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09784101638347129, + "language_loss": 0.80726093, + "learning_rate": 0.0006924205462449161, + "loss": 0.81852722, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2043, + "time_per_iteration": 2.556964159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123407, + "balance_loss_mlp": 1.11139631, + "diversity_loss_mlp": 0.0, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07674510212981295, + "language_loss": 0.81822228, + "learning_rate": 0.0006921329607035702, + "loss": 0.82945639, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.12005615, + "routerloss_mlp": 0.0, + "step": 2044, + "time_per_iteration": 3.2355051040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_mlp": 1.09777582, + "diversity_loss_mlp": 0.0, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0626655505852987, + "language_loss": 0.87889385, + "learning_rate": 0.0006918453005695938, + "loss": 0.88998848, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2045, + "time_per_iteration": 2.616405725479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112555, + "balance_loss_mlp": 1.10047281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0593607382511463, + "language_loss": 0.8430419, + "learning_rate": 0.0006915575659546662, + "loss": 0.85416746, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2046, + "time_per_iteration": 2.6596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_mlp": 1.08785915, + "diversity_loss_mlp": 0.0, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0680979304239865, + "language_loss": 0.80745959, + "learning_rate": 0.0006912697569704959, + "loss": 0.81846058, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2047, + "time_per_iteration": 2.5962154865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097855, + "balance_loss_mlp": 1.08564174, + "diversity_loss_mlp": 0.0, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07634449995136075, + "language_loss": 0.8702817, + "learning_rate": 0.0006909818737288205, + "loss": 0.88126016, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.12207031, + "routerloss_mlp": 0.0, + "step": 2048, + "time_per_iteration": 2.5559332370758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111282, + "balance_loss_mlp": 1.09955215, + "diversity_loss_mlp": 0.0, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07451514550279957, + "language_loss": 0.80715293, + "learning_rate": 0.000690693916341406, + "loss": 0.81826574, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2049, + "time_per_iteration": 2.605881690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_mlp": 1.10377121, + "diversity_loss_mlp": 0.0, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.06516266173427393, + "language_loss": 0.82286257, + "learning_rate": 0.0006904058849200475, + "loss": 0.83401763, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2050, + "time_per_iteration": 2.7183115482330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.09360313, + "diversity_loss_mlp": 0.0, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.0753850450331705, + "language_loss": 0.84972727, + "learning_rate": 0.0006901177795765683, + "loss": 0.8607837, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2051, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105318, + "balance_loss_mlp": 1.09354019, + "diversity_loss_mlp": 0.0, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.06465732667856934, + "language_loss": 0.81096435, + "learning_rate": 0.0006898296004228213, + "loss": 0.82201755, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2052, + "time_per_iteration": 2.7607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050397, + "balance_loss_mlp": 1.04446077, + "diversity_loss_mlp": 0.0, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03031396698302257, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177135, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.05932617, + "routerloss_mlp": 0.0, + "step": 2053, + "time_per_iteration": 4.876460552215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_mlp": 1.10529494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.1105412420488248, + "language_loss": 0.79620701, + "learning_rate": 0.0006892530211320763, + "loss": 0.80737776, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2054, + "time_per_iteration": 2.702591896057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00944261, + "balance_loss_mlp": 1.6481061, + "diversity_loss_mlp": 0.21043469, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.03587460904718008, + "language_loss": 0.84313488, + "learning_rate": 0.000688964621218926, + "loss": 0.85257751, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01499031, + "step": 2055, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109262, + "balance_loss_mlp": 1.08063984, + "diversity_loss_mlp": 0.0, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.0862390851468888, + "language_loss": 0.80478442, + "learning_rate": 0.0006886761479432037, + "loss": 0.81571066, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2056, + "time_per_iteration": 2.8577234745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_mlp": 1.06739902, + "diversity_loss_mlp": 0.0, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.06874544900142358, + "language_loss": 0.84387571, + "learning_rate": 0.0006883876014169045, + "loss": 0.85467529, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2057, + "time_per_iteration": 2.572458505630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073975, + "balance_loss_mlp": 1.06154716, + "diversity_loss_mlp": 0.0, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07681071569739906, + "language_loss": 0.90056652, + "learning_rate": 0.000688098981752052, + "loss": 0.91130626, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 2058, + "time_per_iteration": 2.7125563621520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080003, + "balance_loss_mlp": 1.06697917, + "diversity_loss_mlp": 0.0, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08571295812058347, + "language_loss": 0.80176479, + "learning_rate": 0.0006878102890606982, + "loss": 0.81256485, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 2059, + "time_per_iteration": 3.0797197818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108467, + "balance_loss_mlp": 1.07161617, + "diversity_loss_mlp": 0.0, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08415103615204221, + "language_loss": 0.81576395, + "learning_rate": 0.0006875215234549239, + "loss": 0.82661068, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 2060, + "time_per_iteration": 2.5358171463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_mlp": 1.06604218, + "diversity_loss_mlp": 0.0, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08360675720274492, + "language_loss": 0.85212821, + "learning_rate": 0.0006872326850468376, + "loss": 0.86291778, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 2061, + "time_per_iteration": 2.685746669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_mlp": 1.06612396, + "diversity_loss_mlp": 0.0, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08669948408116639, + "language_loss": 0.78834969, + "learning_rate": 0.0006869437739485762, + "loss": 0.79913992, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.12908936, + "routerloss_mlp": 0.0, + "step": 2062, + "time_per_iteration": 2.608938455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085084, + "balance_loss_mlp": 1.07266808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06314890183319057, + "language_loss": 0.92750764, + "learning_rate": 0.0006866547902723053, + "loss": 0.93835843, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.12420654, + "routerloss_mlp": 0.0, + "step": 2063, + "time_per_iteration": 2.654764175415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07135844, + "diversity_loss_mlp": 0.0, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10797740353372913, + "language_loss": 0.80444092, + "learning_rate": 0.000686365734130218, + "loss": 0.81527805, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2064, + "time_per_iteration": 2.7161076068878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_mlp": 1.07345843, + "diversity_loss_mlp": 0.0, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06605501724079509, + "language_loss": 0.83883071, + "learning_rate": 0.000686076605634536, + "loss": 0.84968603, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2065, + "time_per_iteration": 2.5960052013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088286, + "balance_loss_mlp": 1.07656133, + "diversity_loss_mlp": 0.0, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.06893141882644385, + "language_loss": 0.84303313, + "learning_rate": 0.0006857874048975088, + "loss": 0.85391599, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2066, + "time_per_iteration": 2.5419557094573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.08599246, + "diversity_loss_mlp": 0.0, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.07076940729430262, + "language_loss": 0.86944497, + "learning_rate": 0.0006854981320314142, + "loss": 0.88042831, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2067, + "time_per_iteration": 2.4425127506256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101956, + "balance_loss_mlp": 1.0900414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08678893766230582, + "language_loss": 0.86775517, + "learning_rate": 0.0006852087871485579, + "loss": 0.87877476, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2068, + "time_per_iteration": 2.617234468460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_mlp": 1.09308147, + "diversity_loss_mlp": 0.0, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08540761893483814, + "language_loss": 0.81805646, + "learning_rate": 0.0006849193703612735, + "loss": 0.82910275, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2069, + "time_per_iteration": 2.7818312644958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.08808875, + "diversity_loss_mlp": 0.0, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.06305964525737012, + "language_loss": 0.77731991, + "learning_rate": 0.0006846298817819225, + "loss": 0.78832221, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2070, + "time_per_iteration": 2.970045328140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_mlp": 1.08777106, + "diversity_loss_mlp": 0.0, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.09229213766989015, + "language_loss": 0.81058359, + "learning_rate": 0.0006843403215228945, + "loss": 0.82158017, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2071, + "time_per_iteration": 2.47542405128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.08525538, + "diversity_loss_mlp": 0.0, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.06250612449775428, + "language_loss": 0.80665851, + "learning_rate": 0.0006840506896966065, + "loss": 0.81763273, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2072, + "time_per_iteration": 2.7048730850219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_mlp": 1.09000397, + "diversity_loss_mlp": 0.0, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07670911788950584, + "language_loss": 0.82343054, + "learning_rate": 0.0006837609864155038, + "loss": 0.83445203, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2073, + "time_per_iteration": 2.940208673477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_mlp": 1.09976768, + "diversity_loss_mlp": 0.0, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.06443735331096001, + "language_loss": 0.83203363, + "learning_rate": 0.0006834712117920592, + "loss": 0.84314907, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2074, + "time_per_iteration": 2.6217153072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_mlp": 1.09892166, + "diversity_loss_mlp": 0.0, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07401760730887977, + "language_loss": 0.85670066, + "learning_rate": 0.0006831813659387729, + "loss": 0.86781245, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2075, + "time_per_iteration": 2.5696237087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109855, + "balance_loss_mlp": 1.09774292, + "diversity_loss_mlp": 0.0, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.05990934262108594, + "language_loss": 0.84167391, + "learning_rate": 0.0006828914489681733, + "loss": 0.85277247, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2076, + "time_per_iteration": 2.7859339714050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119025, + "balance_loss_mlp": 1.1072948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.06517456650976074, + "language_loss": 0.85312855, + "learning_rate": 0.0006826014609928162, + "loss": 0.86431879, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2077, + "time_per_iteration": 2.6851699352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0060157, + "balance_loss_mlp": 1.02597332, + "diversity_loss_mlp": 0.1552759, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.0013651319096223075, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8380096, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094547, + "step": 2078, + "time_per_iteration": 4.859188795089722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.1030947, + "diversity_loss_mlp": 0.0, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.0748648316539235, + "language_loss": 0.80062771, + "learning_rate": 0.0006820212724781896, + "loss": 0.81177354, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.11486816, + "routerloss_mlp": 0.0, + "step": 2079, + "time_per_iteration": 2.6628189086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.09492946, + "diversity_loss_mlp": 0.0, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06148312623903997, + "language_loss": 0.83733618, + "learning_rate": 0.0006817310721641694, + "loss": 0.84840119, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2080, + "time_per_iteration": 2.847182512283325 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4718509996638208.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/training_args.bin b/sft_pretrain/Full_competesmoev30/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_competesmoev30/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/config.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28a5bb1c149304f33214eee3c6e2764711ffb065 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/generation_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b64bb118b70dca0b8baf4d69328d3da33b0be412 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d11430c4bea4f689e2a74c3d27d3449b8bb860a86a16f6bb1ec52652d6645bb +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63ddeca5e92307858dc76964696baa22991ffa9e --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f1c0fb3fff6e8764bb8819fc440c1599add3cba24fb3ee61efa5ec2feb2f9fa +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b39f89f404fc7f35a50c7f544fd512eee232228 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f8131c3d862759107bb206f38fd9c344eb34e54275886342e4c21b523aeca6 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68220fc91e90d74847eebf25dae5fe48e3e6b2f1 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d8320f4be2cf3e64906406897a12d592fe5756093e1671cf634cedecf00c3f +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d3a62e542c1f30e6736a3a040df56e050085dc9 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908e77e3b5ae9b0798f5aaf2bc8065061cdedb8f7eae38c1031adb873a056e72 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a4c975fe83d7dee26a39a5e433fab699c67e12a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb37e410e0e10fcb0c84b9ef55b9157078b2324d56195ba861cca0ac712abb8e +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..271e4929df3d88cdab599b86c936c0cdfacc0745 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fa2245ed689f53d780b2620e601edae9eb19aa49505608c70fd8b840b6802c +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2444f964c8cdd0d25dea068855d59dd549409c7b --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68bca661b80adbf5b3bd3384fc6d83cf6492af70ac6e17182333a35d0f354d11 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/latest b/sft_pretrain/Full_competesmoev30/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d68348cc543f283e5d033dc53d668b00393e84f5 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f73a3e1a76d0a97d85a5ac12bff1f9c947114bc00e0f5c5ebf5517148490bfb +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74aaffdc337c5a168a279aed341c53617abfb292 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7428511a0f39116505eb0e78fefd1d50fe2ddacee4482cdd5d925938d450347 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_competesmoev30/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a9343a501626e164ee9f01fd6eddd2f0790e6c2f --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/trainer_state.json @@ -0,0 +1,52473 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.10879421, + "diversity_loss_mlp": 0.0, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0911420547130344, + "language_loss": 0.8431657, + "learning_rate": 0.000925888133132719, + "loss": 0.85441184, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1041, + "time_per_iteration": 2.780141830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_mlp": 1.05260694, + "diversity_loss_mlp": 0.0, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.04139604987307943, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80673575, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 1042, + "time_per_iteration": 4.971017360687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100746, + "balance_loss_mlp": 1.08498645, + "diversity_loss_mlp": 0.0, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.08950731646766712, + "language_loss": 0.81070006, + "learning_rate": 0.0009255613649386244, + "loss": 0.82170749, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.1574707, + "routerloss_mlp": 0.0, + "step": 1043, + "time_per_iteration": 2.6508612632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091355, + "balance_loss_mlp": 1.07623935, + "diversity_loss_mlp": 0.0, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07614483401418765, + "language_loss": 0.78829026, + "learning_rate": 0.0009253977329834838, + "loss": 0.79920387, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1044, + "time_per_iteration": 2.7090582847595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109464, + "balance_loss_mlp": 1.07947624, + "diversity_loss_mlp": 0.0, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.0989854096864982, + "language_loss": 0.86366481, + "learning_rate": 0.0009252339358742965, + "loss": 0.8746112, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1045, + "time_per_iteration": 2.801323652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_mlp": 1.08526874, + "diversity_loss_mlp": 0.0, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07994799859902735, + "language_loss": 0.83704323, + "learning_rate": 0.000925069973674654, + "loss": 0.84804672, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1046, + "time_per_iteration": 2.6286635398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_mlp": 1.09036636, + "diversity_loss_mlp": 0.0, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.05803081938267982, + "language_loss": 0.88841283, + "learning_rate": 0.000924905846448212, + "loss": 0.89945889, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1047, + "time_per_iteration": 2.7208023071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.12078381, + "diversity_loss_mlp": 0.0, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09159511175118457, + "language_loss": 0.85692465, + "learning_rate": 0.0009247415542586906, + "loss": 0.86827493, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1048, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089504, + "balance_loss_mlp": 1.55797935, + "diversity_loss_mlp": 0.19993141, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.028193920194447036, + "language_loss": 0.83094788, + "learning_rate": 0.0009245770971698735, + "loss": 0.83989829, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01608507, + "step": 1049, + "time_per_iteration": 2.922792911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.12878203, + "diversity_loss_mlp": 0.0, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08345797467079887, + "language_loss": 0.88434327, + "learning_rate": 0.0009244124752456087, + "loss": 0.89577425, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1050, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141188, + "balance_loss_mlp": 1.12675214, + "diversity_loss_mlp": 0.0, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07479960387863874, + "language_loss": 0.85303241, + "learning_rate": 0.0009242476885498081, + "loss": 0.86444432, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1051, + "time_per_iteration": 2.8012773990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146474, + "balance_loss_mlp": 1.13181126, + "diversity_loss_mlp": 0.0, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.07632391919964465, + "language_loss": 0.81114984, + "learning_rate": 0.0009240827371464474, + "loss": 0.82261455, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1052, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146729, + "balance_loss_mlp": 1.1323998, + "diversity_loss_mlp": 0.0, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.11219768477147798, + "language_loss": 0.84167284, + "learning_rate": 0.0009239176210995666, + "loss": 0.85314012, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1053, + "time_per_iteration": 3.4905290603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153158, + "balance_loss_mlp": 1.13878179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.07345468089138417, + "language_loss": 0.93850195, + "learning_rate": 0.0009237523404732695, + "loss": 0.95003355, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1054, + "time_per_iteration": 2.8854215145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116831, + "balance_loss_mlp": 1.15374279, + "diversity_loss_mlp": 0.0, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08788286689344726, + "language_loss": 0.84136868, + "learning_rate": 0.0009235868953317235, + "loss": 0.85305184, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1055, + "time_per_iteration": 2.785616397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115453, + "balance_loss_mlp": 1.14033246, + "diversity_loss_mlp": 0.0, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.07006303181868268, + "language_loss": 0.85314858, + "learning_rate": 0.0009234212857391602, + "loss": 0.86469388, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1056, + "time_per_iteration": 3.192293167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167757, + "balance_loss_mlp": 1.15304708, + "diversity_loss_mlp": 0.0, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.07469852363602907, + "language_loss": 0.89220309, + "learning_rate": 0.000923255511759875, + "loss": 0.9038806, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1057, + "time_per_iteration": 2.783778429031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881428, + "balance_loss_mlp": 1.53356147, + "diversity_loss_mlp": 0.1968638, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.032510948660132113, + "language_loss": 0.84587663, + "learning_rate": 0.000923089573458227, + "loss": 0.85469091, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621579, + "step": 1058, + "time_per_iteration": 2.8847100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_mlp": 1.13623881, + "diversity_loss_mlp": 0.0, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.11181454207252314, + "language_loss": 0.83516467, + "learning_rate": 0.0009229234708986392, + "loss": 0.84667218, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1059, + "time_per_iteration": 2.9079415798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172867, + "balance_loss_mlp": 1.16251993, + "diversity_loss_mlp": 0.0, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.06024273804144221, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82839763, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1060, + "time_per_iteration": 4.646218776702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112152, + "balance_loss_mlp": 1.10713172, + "diversity_loss_mlp": 0.0, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.08928557521337042, + "language_loss": 0.85345757, + "learning_rate": 0.0009225907732636548, + "loss": 0.86467278, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1061, + "time_per_iteration": 2.745448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_mlp": 1.09209883, + "diversity_loss_mlp": 0.0, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.079028173596336, + "language_loss": 0.86936563, + "learning_rate": 0.0009224241783174227, + "loss": 0.88042819, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1062, + "time_per_iteration": 2.6923935413360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_mlp": 1.07616472, + "diversity_loss_mlp": 0.0, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.07452632641130948, + "language_loss": 0.85384166, + "learning_rate": 0.0009222574193715802, + "loss": 0.86474669, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1063, + "time_per_iteration": 2.7701327800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07850981, + "diversity_loss_mlp": 0.0, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06517233034985846, + "language_loss": 0.85915947, + "learning_rate": 0.000922090496490869, + "loss": 0.87008905, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1064, + "time_per_iteration": 2.7387099266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098934, + "balance_loss_mlp": 1.08404493, + "diversity_loss_mlp": 0.0, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.06963355430403552, + "language_loss": 0.89889115, + "learning_rate": 0.0009219234097400937, + "loss": 0.90988052, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1065, + "time_per_iteration": 2.859334707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112884, + "balance_loss_mlp": 1.09778059, + "diversity_loss_mlp": 0.0, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06723697540994414, + "language_loss": 0.83086514, + "learning_rate": 0.0009217561591841237, + "loss": 0.84199405, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1066, + "time_per_iteration": 3.3065547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00886484, + "balance_loss_mlp": 1.54046464, + "diversity_loss_mlp": 0.1982768, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.03984406199709606, + "language_loss": 0.80820358, + "learning_rate": 0.0009215887448878913, + "loss": 0.8170684, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01711285, + "step": 1067, + "time_per_iteration": 2.6291754245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131678, + "balance_loss_mlp": 1.11697936, + "diversity_loss_mlp": 0.0, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.07633348035576148, + "language_loss": 0.85365784, + "learning_rate": 0.0009214211669163922, + "loss": 0.86497462, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1068, + "time_per_iteration": 2.747936725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_mlp": 1.12220347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.07197705825645119, + "language_loss": 0.9405331, + "learning_rate": 0.0009212534253346862, + "loss": 0.95190227, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1069, + "time_per_iteration": 2.696131467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128507, + "balance_loss_mlp": 1.11372542, + "diversity_loss_mlp": 0.0, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09743186487320747, + "language_loss": 0.84269625, + "learning_rate": 0.0009210855202078964, + "loss": 0.85398132, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1070, + "time_per_iteration": 2.6194372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114316, + "balance_loss_mlp": 1.12903321, + "diversity_loss_mlp": 0.0, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08033414700046611, + "language_loss": 0.87081122, + "learning_rate": 0.0009209174516012091, + "loss": 0.88224292, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1071, + "time_per_iteration": 2.5169904232025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146914, + "balance_loss_mlp": 1.13247752, + "diversity_loss_mlp": 0.0, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.06769648970134874, + "language_loss": 0.89207751, + "learning_rate": 0.0009207492195798747, + "loss": 0.90354669, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1072, + "time_per_iteration": 2.804577112197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.12303698, + "diversity_loss_mlp": 0.0, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.0857236005827703, + "language_loss": 0.84780991, + "learning_rate": 0.0009205808242092061, + "loss": 0.85918474, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1073, + "time_per_iteration": 2.6134936809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122455, + "balance_loss_mlp": 1.10787559, + "diversity_loss_mlp": 0.0, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.09531084522047072, + "language_loss": 0.82512677, + "learning_rate": 0.0009204122655545808, + "loss": 0.83635134, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1074, + "time_per_iteration": 3.461315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888955, + "balance_loss_mlp": 1.54418314, + "diversity_loss_mlp": 0.20175909, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.03221822204199988, + "language_loss": 0.80952764, + "learning_rate": 0.0009202435436814388, + "loss": 0.81841719, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01598355, + "step": 1075, + "time_per_iteration": 2.728055238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146745, + "balance_loss_mlp": 1.13259482, + "diversity_loss_mlp": 0.0, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.0831097658087499, + "language_loss": 0.89925295, + "learning_rate": 0.0009200746586552836, + "loss": 0.91072041, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1076, + "time_per_iteration": 2.929422616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.12185347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.07960863169785164, + "language_loss": 0.84148425, + "learning_rate": 0.0009199056105416825, + "loss": 0.85284609, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1077, + "time_per_iteration": 3.0795576572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_mlp": 1.13384151, + "diversity_loss_mlp": 0.0, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.06589509494701294, + "language_loss": 0.86599898, + "learning_rate": 0.0009197363994062654, + "loss": 0.87747955, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1078, + "time_per_iteration": 2.8304550647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891417, + "balance_loss_mlp": 1.54815006, + "diversity_loss_mlp": 0.20151556, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.027729032115243194, + "language_loss": 0.84302026, + "learning_rate": 0.0009195670253147262, + "loss": 0.85193443, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01658459, + "step": 1079, + "time_per_iteration": 2.987715005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168872, + "balance_loss_mlp": 1.15472198, + "diversity_loss_mlp": 0.0, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07878432741989363, + "language_loss": 0.82508785, + "learning_rate": 0.0009193974883328216, + "loss": 0.83677661, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1080, + "time_per_iteration": 2.6007754802703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178335, + "balance_loss_mlp": 1.16408908, + "diversity_loss_mlp": 0.0, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06872318796781544, + "language_loss": 0.86871535, + "learning_rate": 0.0009192277885263718, + "loss": 0.88049871, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1081, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_mlp": 1.15339386, + "diversity_loss_mlp": 0.0, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.08475435362049728, + "language_loss": 0.86010319, + "learning_rate": 0.0009190579259612602, + "loss": 0.87178093, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1082, + "time_per_iteration": 3.2688331604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153529, + "balance_loss_mlp": 1.13914001, + "diversity_loss_mlp": 0.0, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.06676527060715894, + "language_loss": 0.86419082, + "learning_rate": 0.000918887900703433, + "loss": 0.8757261, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1083, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129996, + "balance_loss_mlp": 1.11559522, + "diversity_loss_mlp": 0.0, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07296749014166971, + "language_loss": 0.89779425, + "learning_rate": 0.0009187177128188999, + "loss": 0.90909421, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1084, + "time_per_iteration": 2.441312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_mlp": 1.11915255, + "diversity_loss_mlp": 0.0, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.053207927956046876, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78285372, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1085, + "time_per_iteration": 4.864179849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117368, + "balance_loss_mlp": 1.1029439, + "diversity_loss_mlp": 0.0, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07905606819783856, + "language_loss": 0.85833263, + "learning_rate": 0.000918376849434071, + "loss": 0.86950636, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1086, + "time_per_iteration": 4.049270868301392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112999, + "balance_loss_mlp": 1.09849179, + "diversity_loss_mlp": 0.0, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.08954509639668791, + "language_loss": 0.90778226, + "learning_rate": 0.0009182061740661098, + "loss": 0.91891223, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1087, + "time_per_iteration": 2.557358741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128974, + "balance_loss_mlp": 1.11446643, + "diversity_loss_mlp": 0.0, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08446380837501397, + "language_loss": 0.85054636, + "learning_rate": 0.0009180353363361127, + "loss": 0.86183608, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1088, + "time_per_iteration": 3.0897305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118573, + "balance_loss_mlp": 1.10417306, + "diversity_loss_mlp": 0.0, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.08173869768976531, + "language_loss": 0.82508695, + "learning_rate": 0.0009178643363104044, + "loss": 0.83627272, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1089, + "time_per_iteration": 3.124645948410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_mlp": 1.09938824, + "diversity_loss_mlp": 0.0, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.09307233053408402, + "language_loss": 0.90518665, + "learning_rate": 0.0009176931740553735, + "loss": 0.9163233, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1090, + "time_per_iteration": 2.6098225116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.09981966, + "diversity_loss_mlp": 0.0, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.09489388322063774, + "language_loss": 0.8240813, + "learning_rate": 0.0009175218496374708, + "loss": 0.83521861, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1091, + "time_per_iteration": 3.336355686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110612, + "balance_loss_mlp": 1.09205294, + "diversity_loss_mlp": 0.0, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.08870561470384966, + "language_loss": 0.86057436, + "learning_rate": 0.0009173503631232103, + "loss": 0.87163556, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1092, + "time_per_iteration": 3.356015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106884, + "balance_loss_mlp": 1.09269798, + "diversity_loss_mlp": 0.0, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09478788106803046, + "language_loss": 0.82067865, + "learning_rate": 0.0009171787145791691, + "loss": 0.83174753, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1093, + "time_per_iteration": 3.2546143531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116222, + "balance_loss_mlp": 1.10199988, + "diversity_loss_mlp": 0.0, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14674509624116924, + "language_loss": 0.80160701, + "learning_rate": 0.000917006904071987, + "loss": 0.81276917, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1094, + "time_per_iteration": 2.5837080478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911953, + "balance_loss_mlp": 1.58726883, + "diversity_loss_mlp": 0.20477253, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.035943125208157026, + "language_loss": 0.8737694, + "learning_rate": 0.0009168349316683669, + "loss": 0.88288891, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01593196, + "step": 1095, + "time_per_iteration": 2.768296718597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136825, + "balance_loss_mlp": 1.1224122, + "diversity_loss_mlp": 0.0, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06639171103878667, + "language_loss": 0.82719827, + "learning_rate": 0.0009166627974350741, + "loss": 0.83856648, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1096, + "time_per_iteration": 2.8819992542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145046, + "balance_loss_mlp": 1.13041949, + "diversity_loss_mlp": 0.0, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.08337696606413014, + "language_loss": 0.89929205, + "learning_rate": 0.0009164905014389373, + "loss": 0.91074252, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1097, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_mlp": 1.1495918, + "diversity_loss_mlp": 0.0, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08033808486911229, + "language_loss": 0.86386079, + "learning_rate": 0.0009163180437468476, + "loss": 0.87549889, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1098, + "time_per_iteration": 2.6314592361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176615, + "balance_loss_mlp": 1.16195273, + "diversity_loss_mlp": 0.0, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.09094665560265827, + "language_loss": 0.85629344, + "learning_rate": 0.000916145424425759, + "loss": 0.86805964, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1099, + "time_per_iteration": 2.6608541011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181873, + "balance_loss_mlp": 1.16744852, + "diversity_loss_mlp": 0.0, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.09944182260515583, + "language_loss": 0.9083795, + "learning_rate": 0.0009159726435426885, + "loss": 0.9201982, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1100, + "time_per_iteration": 3.0502405166625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.134619, + "diversity_loss_mlp": 0.0, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.09151162791452093, + "language_loss": 0.90900993, + "learning_rate": 0.0009157997011647154, + "loss": 0.92050231, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1101, + "time_per_iteration": 2.6048476696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127613, + "balance_loss_mlp": 1.11389172, + "diversity_loss_mlp": 0.0, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.07696729699318336, + "language_loss": 0.86130077, + "learning_rate": 0.0009156265973589817, + "loss": 0.87257689, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1102, + "time_per_iteration": 2.7552144527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114805, + "balance_loss_mlp": 1.10088181, + "diversity_loss_mlp": 0.0, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.07661877314329607, + "language_loss": 0.89485067, + "learning_rate": 0.0009154533321926926, + "loss": 0.90599877, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.13909912, + "routerloss_mlp": 0.0, + "step": 1103, + "time_per_iteration": 4.073851108551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_mlp": 1.09134197, + "diversity_loss_mlp": 0.0, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08363594534482698, + "language_loss": 0.8717171, + "learning_rate": 0.0009152799057331156, + "loss": 0.88276958, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1104, + "time_per_iteration": 3.142221450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_mlp": 1.08656633, + "diversity_loss_mlp": 0.0, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1056362594360365, + "language_loss": 0.91270363, + "learning_rate": 0.0009151063180475805, + "loss": 0.92370498, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1105, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095772, + "balance_loss_mlp": 1.08196795, + "diversity_loss_mlp": 0.0, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.08072473316090223, + "language_loss": 0.84285367, + "learning_rate": 0.0009149325692034803, + "loss": 0.85381138, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1106, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.06266928, + "diversity_loss_mlp": 0.0, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.04229613635199888, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.8027482, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1107, + "time_per_iteration": 4.817704916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129097, + "balance_loss_mlp": 1.11547112, + "diversity_loss_mlp": 0.0, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.07382538641756346, + "language_loss": 0.8748607, + "learning_rate": 0.0009145845883094678, + "loss": 0.88615161, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1108, + "time_per_iteration": 3.039318561553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150208, + "balance_loss_mlp": 1.13671303, + "diversity_loss_mlp": 0.0, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07887220377556703, + "language_loss": 0.85174125, + "learning_rate": 0.000914410356394654, + "loss": 0.86324334, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1109, + "time_per_iteration": 2.76413893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_mlp": 1.1484766, + "diversity_loss_mlp": 0.0, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.06362602917472766, + "language_loss": 0.84447891, + "learning_rate": 0.0009142359635914709, + "loss": 0.85609984, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1110, + "time_per_iteration": 3.007201671600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163563, + "balance_loss_mlp": 1.15004468, + "diversity_loss_mlp": 0.0, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07633144605420673, + "language_loss": 0.84598219, + "learning_rate": 0.0009140614099676245, + "loss": 0.85761786, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1111, + "time_per_iteration": 2.569401979446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161722, + "balance_loss_mlp": 1.14807272, + "diversity_loss_mlp": 0.0, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.0712977258009472, + "language_loss": 0.82590818, + "learning_rate": 0.0009138866955908821, + "loss": 0.83752549, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1112, + "time_per_iteration": 2.870701789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_mlp": 1.15294182, + "diversity_loss_mlp": 0.0, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.09239605609063735, + "language_loss": 0.80485952, + "learning_rate": 0.0009137118205290738, + "loss": 0.81652606, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.13739014, + "routerloss_mlp": 0.0, + "step": 1113, + "time_per_iteration": 2.9623591899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174843, + "balance_loss_mlp": 1.16082442, + "diversity_loss_mlp": 0.0, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08763873550503462, + "language_loss": 0.90553653, + "learning_rate": 0.0009135367848500924, + "loss": 0.91728497, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1114, + "time_per_iteration": 2.5287492275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.15138936, + "diversity_loss_mlp": 0.0, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.11593363319598911, + "language_loss": 0.86361086, + "learning_rate": 0.0009133615886218927, + "loss": 0.87526232, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1115, + "time_per_iteration": 2.6945505142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141616, + "balance_loss_mlp": 1.12725139, + "diversity_loss_mlp": 0.0, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08371979294567897, + "language_loss": 0.87389791, + "learning_rate": 0.0009131862319124917, + "loss": 0.88531411, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1116, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130003, + "balance_loss_mlp": 1.1162107, + "diversity_loss_mlp": 0.0, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08272793517794225, + "language_loss": 0.83981287, + "learning_rate": 0.0009130107147899691, + "loss": 0.85111284, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1117, + "time_per_iteration": 2.698151111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.1039083, + "diversity_loss_mlp": 0.0, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.4685945915436946, + "language_loss": 0.85086691, + "learning_rate": 0.0009128350373224665, + "loss": 0.86204791, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1118, + "time_per_iteration": 2.545565128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059182, + "balance_loss_mlp": 1.04950213, + "diversity_loss_mlp": 0.0, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.03761711697708654, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82515609, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1119, + "time_per_iteration": 4.648902416229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118843, + "balance_loss_mlp": 1.10412121, + "diversity_loss_mlp": 0.0, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07492511871579786, + "language_loss": 0.85205054, + "learning_rate": 0.0009124832016254005, + "loss": 0.86323893, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1120, + "time_per_iteration": 2.5875513553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112958, + "balance_loss_mlp": 1.11404657, + "diversity_loss_mlp": 0.0, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.10623123993924175, + "language_loss": 0.88117284, + "learning_rate": 0.0009123070435324316, + "loss": 0.89246857, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1121, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119114, + "balance_loss_mlp": 1.10852826, + "diversity_loss_mlp": 0.0, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.05861429426141409, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78994894, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 1122, + "time_per_iteration": 4.993450880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114894, + "balance_loss_mlp": 1.13229823, + "diversity_loss_mlp": 0.0, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.09758120262844092, + "language_loss": 0.86477894, + "learning_rate": 0.0009119542471995752, + "loss": 0.87626839, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 1123, + "time_per_iteration": 2.8260560035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132116, + "balance_loss_mlp": 1.1160109, + "diversity_loss_mlp": 0.0, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.1175490331770948, + "language_loss": 0.81597894, + "learning_rate": 0.0009117776090966554, + "loss": 0.82730007, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1124, + "time_per_iteration": 2.955768585205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133281, + "balance_loss_mlp": 1.1166153, + "diversity_loss_mlp": 0.0, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08908783615486303, + "language_loss": 0.86717665, + "learning_rate": 0.0009116008111274899, + "loss": 0.87850952, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1125, + "time_per_iteration": 3.2493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_mlp": 1.02921367, + "diversity_loss_mlp": 0.0, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.03267712428803131, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80145574, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 1126, + "time_per_iteration": 4.8121678829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_mlp": 1.13257909, + "diversity_loss_mlp": 0.0, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.09699177011816186, + "language_loss": 0.85244691, + "learning_rate": 0.0009112467358650396, + "loss": 0.86393118, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1127, + "time_per_iteration": 3.144075393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_mlp": 1.15056634, + "diversity_loss_mlp": 0.0, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.07985175184807933, + "language_loss": 0.86319685, + "learning_rate": 0.0009110694587092192, + "loss": 0.87486213, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.1595459, + "routerloss_mlp": 0.0, + "step": 1128, + "time_per_iteration": 2.7497644424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179675, + "balance_loss_mlp": 1.1634866, + "diversity_loss_mlp": 0.0, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.1038215552752292, + "language_loss": 0.81267089, + "learning_rate": 0.0009108920219620815, + "loss": 0.82446766, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 1129, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195026, + "balance_loss_mlp": 1.1788609, + "diversity_loss_mlp": 0.0, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06771714561059723, + "language_loss": 0.89286679, + "learning_rate": 0.0009107144256925133, + "loss": 0.9048171, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1130, + "time_per_iteration": 2.6569926738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196317, + "balance_loss_mlp": 1.18006873, + "diversity_loss_mlp": 0.0, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08333124164895586, + "language_loss": 0.82520813, + "learning_rate": 0.0009105366699694638, + "loss": 0.83717132, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 1131, + "time_per_iteration": 2.7384698390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200769, + "balance_loss_mlp": 1.18390059, + "diversity_loss_mlp": 0.0, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.07018840625680964, + "language_loss": 0.81826723, + "learning_rate": 0.0009103587548619439, + "loss": 0.83027488, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 1132, + "time_per_iteration": 2.8361291885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188026, + "balance_loss_mlp": 1.17064476, + "diversity_loss_mlp": 0.0, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.08238158624987729, + "language_loss": 0.85952497, + "learning_rate": 0.0009101806804390261, + "loss": 0.87140524, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.1739502, + "routerloss_mlp": 0.0, + "step": 1133, + "time_per_iteration": 2.8646528720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846565, + "balance_loss_mlp": 1.45559311, + "diversity_loss_mlp": 0.20202307, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.03511986753794681, + "language_loss": 0.90682399, + "learning_rate": 0.0009100024467698453, + "loss": 0.91528964, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01775702, + "step": 1134, + "time_per_iteration": 2.628955364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119036, + "balance_loss_mlp": 1.17289567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.09831196896097749, + "language_loss": 0.82889581, + "learning_rate": 0.0009098240539235981, + "loss": 0.84079945, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 1135, + "time_per_iteration": 2.6857638359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_mlp": 1.16191649, + "diversity_loss_mlp": 0.0, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07855046788509763, + "language_loss": 0.87649047, + "learning_rate": 0.0009096455019695423, + "loss": 0.88828909, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 1136, + "time_per_iteration": 2.814746856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_mlp": 1.15702188, + "diversity_loss_mlp": 0.0, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.090535881946018, + "language_loss": 0.89789271, + "learning_rate": 0.000909466790976998, + "loss": 0.90964472, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18188477, + "routerloss_mlp": 0.0, + "step": 1137, + "time_per_iteration": 2.503934144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13231349, + "diversity_loss_mlp": 0.0, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07386356915969775, + "language_loss": 0.82546908, + "learning_rate": 0.0009092879210153473, + "loss": 0.83698207, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18981934, + "routerloss_mlp": 0.0, + "step": 1138, + "time_per_iteration": 3.106015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143167, + "balance_loss_mlp": 1.12445128, + "diversity_loss_mlp": 0.0, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08443059177839436, + "language_loss": 0.89126158, + "learning_rate": 0.0009091088921540333, + "loss": 0.90269327, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.18701172, + "routerloss_mlp": 0.0, + "step": 1139, + "time_per_iteration": 2.5165584087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197317, + "balance_loss_mlp": 1.18491888, + "diversity_loss_mlp": 0.0, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.06938907882855633, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76705992, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 1140, + "time_per_iteration": 4.907839775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845315, + "balance_loss_mlp": 1.45913088, + "diversity_loss_mlp": 0.19676474, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.04157801253712285, + "language_loss": 0.84799111, + "learning_rate": 0.0009087503580104985, + "loss": 0.8564443, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01736734, + "step": 1141, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106137, + "balance_loss_mlp": 1.08643126, + "diversity_loss_mlp": 0.0, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09652849342648293, + "language_loss": 0.7964108, + "learning_rate": 0.0009085708528674728, + "loss": 0.80747211, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 1142, + "time_per_iteration": 2.7800490856170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.09476519, + "diversity_loss_mlp": 0.0, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.11345906914127299, + "language_loss": 0.8700006, + "learning_rate": 0.0009083911891031745, + "loss": 0.88115132, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 1143, + "time_per_iteration": 3.104893684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_mlp": 1.08533978, + "diversity_loss_mlp": 0.0, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.12428556161586228, + "language_loss": 0.91569418, + "learning_rate": 0.0009082113667873553, + "loss": 0.92673439, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18676758, + "routerloss_mlp": 0.0, + "step": 1144, + "time_per_iteration": 3.0838277339935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_mlp": 1.12060392, + "diversity_loss_mlp": 0.0, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.0955721440223133, + "language_loss": 0.90911627, + "learning_rate": 0.0009080313859898283, + "loss": 0.92050546, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 1145, + "time_per_iteration": 2.4998109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.14463091, + "diversity_loss_mlp": 0.0, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07871728913387968, + "language_loss": 0.91642439, + "learning_rate": 0.0009078512467804684, + "loss": 0.92804986, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.17932129, + "routerloss_mlp": 0.0, + "step": 1146, + "time_per_iteration": 2.583137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192448, + "balance_loss_mlp": 1.17516243, + "diversity_loss_mlp": 0.0, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.10815580627735921, + "language_loss": 0.90245295, + "learning_rate": 0.0009076709492292119, + "loss": 0.91437739, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 1147, + "time_per_iteration": 2.6189510822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199389, + "balance_loss_mlp": 1.18260384, + "diversity_loss_mlp": 0.0, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.10018226205073696, + "language_loss": 0.88948917, + "learning_rate": 0.0009074904934060562, + "loss": 0.90148306, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1148, + "time_per_iteration": 2.6619913578033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.17623389, + "diversity_loss_mlp": 0.0, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.09879445691718633, + "language_loss": 0.85041308, + "learning_rate": 0.0009073098793810607, + "loss": 0.8623414, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.1661377, + "routerloss_mlp": 0.0, + "step": 1149, + "time_per_iteration": 2.9382119178771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185083, + "balance_loss_mlp": 1.16848898, + "diversity_loss_mlp": 0.0, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.09716543961816822, + "language_loss": 0.88557786, + "learning_rate": 0.000907129107224346, + "loss": 0.89742863, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.16601562, + "routerloss_mlp": 0.0, + "step": 1150, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.17356002, + "diversity_loss_mlp": 0.0, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.0741661773141201, + "language_loss": 0.88313866, + "learning_rate": 0.0009069481770060939, + "loss": 0.89504004, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1151, + "time_per_iteration": 2.676938056945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118655, + "balance_loss_mlp": 1.17039752, + "diversity_loss_mlp": 0.0, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06827936796637825, + "language_loss": 0.83848286, + "learning_rate": 0.000906767088796548, + "loss": 0.85034835, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1152, + "time_per_iteration": 3.442782163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185002, + "balance_loss_mlp": 1.16889715, + "diversity_loss_mlp": 0.0, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.07358747282835834, + "language_loss": 0.87001419, + "learning_rate": 0.0009065858426660127, + "loss": 0.88186425, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1153, + "time_per_iteration": 2.6501753330230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178927, + "balance_loss_mlp": 1.16286922, + "diversity_loss_mlp": 0.0, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.0863709920952229, + "language_loss": 0.84764236, + "learning_rate": 0.0009064044386848543, + "loss": 0.85943162, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1154, + "time_per_iteration": 2.920689344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176891, + "balance_loss_mlp": 1.16032064, + "diversity_loss_mlp": 0.0, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.07669791788600007, + "language_loss": 0.88829726, + "learning_rate": 0.0009062228769234997, + "loss": 0.90006614, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 1155, + "time_per_iteration": 2.561638832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154629, + "balance_loss_mlp": 1.13797593, + "diversity_loss_mlp": 0.0, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08447027490527963, + "language_loss": 0.81123281, + "learning_rate": 0.0009060411574524376, + "loss": 0.82277906, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 1156, + "time_per_iteration": 2.655132293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162354, + "balance_loss_mlp": 1.14597416, + "diversity_loss_mlp": 0.0, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.08665349089557017, + "language_loss": 0.87817705, + "learning_rate": 0.0009058592803422178, + "loss": 0.88980061, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 1157, + "time_per_iteration": 3.1417362689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_mlp": 1.17430186, + "diversity_loss_mlp": 0.0, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.06198684812147071, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79893315, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1158, + "time_per_iteration": 4.867843866348267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.1120069, + "diversity_loss_mlp": 0.0, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0864152607347894, + "language_loss": 0.90156865, + "learning_rate": 0.00090549505348681, + "loss": 0.91285539, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1159, + "time_per_iteration": 2.581865072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118982, + "balance_loss_mlp": 1.1025548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07056827667929483, + "language_loss": 0.83819324, + "learning_rate": 0.0009053127038830275, + "loss": 0.84938306, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 1160, + "time_per_iteration": 2.9969708919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881169, + "balance_loss_mlp": 1.53314447, + "diversity_loss_mlp": 0.19063006, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.04002382495760162, + "language_loss": 0.87460124, + "learning_rate": 0.000905130196922898, + "loss": 0.88341296, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01928164, + "step": 1161, + "time_per_iteration": 2.6307718753814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881407, + "balance_loss_mlp": 1.5316093, + "diversity_loss_mlp": 0.19140732, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.030280826501304762, + "language_loss": 0.86784196, + "learning_rate": 0.0009049475326772769, + "loss": 0.87665606, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01989887, + "step": 1162, + "time_per_iteration": 2.6021478176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00889034, + "balance_loss_mlp": 1.54766631, + "diversity_loss_mlp": 0.19066738, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.03198536270345376, + "language_loss": 0.83124602, + "learning_rate": 0.0009047647112170811, + "loss": 0.84013629, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01986698, + "step": 1163, + "time_per_iteration": 2.804150342941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123868, + "balance_loss_mlp": 1.1070838, + "diversity_loss_mlp": 0.0, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.09901141435665076, + "language_loss": 0.87948084, + "learning_rate": 0.0009045817326132876, + "loss": 0.89071947, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1164, + "time_per_iteration": 3.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125369, + "balance_loss_mlp": 1.107988, + "diversity_loss_mlp": 0.0, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08432013167879508, + "language_loss": 0.83142793, + "learning_rate": 0.0009043985969369357, + "loss": 0.84268159, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17407227, + "routerloss_mlp": 0.0, + "step": 1165, + "time_per_iteration": 2.8148193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146301, + "balance_loss_mlp": 1.12976706, + "diversity_loss_mlp": 0.0, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06944445596490195, + "language_loss": 0.84334069, + "learning_rate": 0.0009042153042591245, + "loss": 0.85480368, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 1166, + "time_per_iteration": 2.8004493713378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142176, + "balance_loss_mlp": 1.12542677, + "diversity_loss_mlp": 0.0, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.06821660135571728, + "language_loss": 0.85225487, + "learning_rate": 0.0009040318546510146, + "loss": 0.86367661, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 1167, + "time_per_iteration": 3.1969215869903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156354, + "balance_loss_mlp": 1.13979554, + "diversity_loss_mlp": 0.0, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.06547364647617461, + "language_loss": 0.84988701, + "learning_rate": 0.0009038482481838275, + "loss": 0.86145055, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 1168, + "time_per_iteration": 2.7087180614471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00861334, + "balance_loss_mlp": 1.49333596, + "diversity_loss_mlp": 0.19261675, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.02892951533663535, + "language_loss": 0.87266529, + "learning_rate": 0.0009036644849288455, + "loss": 0.88127863, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01835741, + "step": 1169, + "time_per_iteration": 3.1039352416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.1631248, + "diversity_loss_mlp": 0.0, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.06865085555084699, + "language_loss": 0.85404736, + "learning_rate": 0.0009034805649574118, + "loss": 0.86584634, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.16784668, + "routerloss_mlp": 0.0, + "step": 1170, + "time_per_iteration": 2.659322738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208955, + "balance_loss_mlp": 1.1926589, + "diversity_loss_mlp": 0.0, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.07685307661183591, + "language_loss": 0.85691977, + "learning_rate": 0.0009032964883409308, + "loss": 0.86900926, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 1171, + "time_per_iteration": 2.8938751220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128211, + "balance_loss_mlp": 1.11910319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.06058864885284362, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74178743, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 1172, + "time_per_iteration": 4.983820676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217918, + "balance_loss_mlp": 1.20207548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.1048847225020503, + "language_loss": 0.8717351, + "learning_rate": 0.0009029278654587462, + "loss": 0.88391435, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.1583252, + "routerloss_mlp": 0.0, + "step": 1173, + "time_per_iteration": 2.639632225036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_mlp": 1.16508245, + "diversity_loss_mlp": 0.0, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.07111002228073603, + "language_loss": 0.82226282, + "learning_rate": 0.0009027433193361548, + "loss": 0.83407944, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1174, + "time_per_iteration": 2.7443323135375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159983, + "balance_loss_mlp": 1.14366364, + "diversity_loss_mlp": 0.0, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06531304020653, + "language_loss": 0.86980343, + "learning_rate": 0.00090255861685474, + "loss": 0.88140327, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 1175, + "time_per_iteration": 2.7534220218658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142116, + "balance_loss_mlp": 1.12533128, + "diversity_loss_mlp": 0.0, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.10016618462748716, + "language_loss": 0.90750074, + "learning_rate": 0.0009023737580862095, + "loss": 0.91892195, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1176, + "time_per_iteration": 2.5116937160491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_mlp": 1.12470055, + "diversity_loss_mlp": 0.0, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0707285441494173, + "language_loss": 0.83225566, + "learning_rate": 0.0009021887431023321, + "loss": 0.84366333, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1177, + "time_per_iteration": 2.599956512451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130034, + "balance_loss_mlp": 1.11444104, + "diversity_loss_mlp": 0.0, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08431891612549362, + "language_loss": 0.87212515, + "learning_rate": 0.0009020035719749369, + "loss": 0.88342547, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1178, + "time_per_iteration": 2.7144312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135701, + "balance_loss_mlp": 1.1205014, + "diversity_loss_mlp": 0.0, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.09883499682369536, + "language_loss": 0.77450085, + "learning_rate": 0.0009018182447759136, + "loss": 0.7858578, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1179, + "time_per_iteration": 2.98848557472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137145, + "balance_loss_mlp": 1.12187457, + "diversity_loss_mlp": 0.0, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.08173095074239418, + "language_loss": 0.79878223, + "learning_rate": 0.0009016327615772126, + "loss": 0.81015366, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1180, + "time_per_iteration": 2.9338154792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149275, + "balance_loss_mlp": 1.13449335, + "diversity_loss_mlp": 0.0, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.08374692364956231, + "language_loss": 0.87680298, + "learning_rate": 0.0009014471224508451, + "loss": 0.88829577, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1181, + "time_per_iteration": 2.7131431102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881934, + "balance_loss_mlp": 1.53494334, + "diversity_loss_mlp": 0.19571492, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.04185105584005936, + "language_loss": 0.83154267, + "learning_rate": 0.0009012613274688823, + "loss": 0.84036207, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01660516, + "step": 1182, + "time_per_iteration": 2.649559736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184244, + "balance_loss_mlp": 1.1692239, + "diversity_loss_mlp": 0.0, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.12019924395271459, + "language_loss": 0.87753081, + "learning_rate": 0.0009010753767034565, + "loss": 0.8893733, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1183, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175003, + "balance_loss_mlp": 1.16030502, + "diversity_loss_mlp": 0.0, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.08783280174490297, + "language_loss": 0.78918862, + "learning_rate": 0.0009008892702267599, + "loss": 0.80093861, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1184, + "time_per_iteration": 2.9962406158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139115, + "balance_loss_mlp": 1.12460732, + "diversity_loss_mlp": 0.0, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.08254121322216867, + "language_loss": 0.88525105, + "learning_rate": 0.0009007030081110457, + "loss": 0.89664215, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1185, + "time_per_iteration": 2.5990660190582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_mlp": 1.11087465, + "diversity_loss_mlp": 0.0, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.07610459395316062, + "language_loss": 0.84548527, + "learning_rate": 0.000900516590428627, + "loss": 0.85674113, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1186, + "time_per_iteration": 2.7377407550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121666, + "balance_loss_mlp": 1.1070751, + "diversity_loss_mlp": 0.0, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.13748029932532174, + "language_loss": 0.89182103, + "learning_rate": 0.0009003300172518778, + "loss": 0.90303767, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1187, + "time_per_iteration": 2.6916556358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_mlp": 1.10145736, + "diversity_loss_mlp": 0.0, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.11313229810108143, + "language_loss": 0.84335989, + "learning_rate": 0.0009001432886532321, + "loss": 0.85452211, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1188, + "time_per_iteration": 2.9698264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.06729358528862889, + "language_loss": 0.86774516, + "learning_rate": 0.0008999564047051843, + "loss": 0.87889242, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1189, + "time_per_iteration": 2.5002098083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_mlp": 1.12243462, + "diversity_loss_mlp": 0.0, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0714274855120672, + "language_loss": 0.84824312, + "learning_rate": 0.0008997693654802894, + "loss": 0.85961115, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1190, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149047, + "balance_loss_mlp": 1.13425303, + "diversity_loss_mlp": 0.0, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.07754985979781381, + "language_loss": 0.86714745, + "learning_rate": 0.0008995821710511625, + "loss": 0.87863791, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1191, + "time_per_iteration": 2.7126989364624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14807296, + "diversity_loss_mlp": 0.0, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.11547698788472376, + "language_loss": 0.85060751, + "learning_rate": 0.0008993948214904786, + "loss": 0.86223602, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1192, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152534, + "balance_loss_mlp": 1.14361739, + "diversity_loss_mlp": 0.0, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.05307726892258072, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79574746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 1193, + "time_per_iteration": 4.909748792648315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187526, + "balance_loss_mlp": 1.17205215, + "diversity_loss_mlp": 0.0, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.09739164860103838, + "language_loss": 0.78353333, + "learning_rate": 0.0008990196572654427, + "loss": 0.79540861, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.15454102, + "routerloss_mlp": 0.0, + "step": 1194, + "time_per_iteration": 2.8592262268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117424, + "balance_loss_mlp": 1.1592319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.06260411033315277, + "language_loss": 0.87559408, + "learning_rate": 0.0008988318427467426, + "loss": 0.88733649, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1195, + "time_per_iteration": 2.7444722652435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00878316, + "balance_loss_mlp": 1.52780199, + "diversity_loss_mlp": 0.1948241, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.0364111048645648, + "language_loss": 0.86376345, + "learning_rate": 0.0008986438733877887, + "loss": 0.87254667, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01700337, + "step": 1196, + "time_per_iteration": 3.5090088844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137546, + "balance_loss_mlp": 1.1229074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.08413871186116019, + "language_loss": 0.83810687, + "learning_rate": 0.0008984557492615576, + "loss": 0.84948236, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1197, + "time_per_iteration": 2.9953744411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10803354, + "diversity_loss_mlp": 0.0, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.08617240411661099, + "language_loss": 0.90267789, + "learning_rate": 0.0008982674704410854, + "loss": 0.91390687, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1198, + "time_per_iteration": 2.7513339519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110338, + "balance_loss_mlp": 1.09598517, + "diversity_loss_mlp": 0.0, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.11146547076727734, + "language_loss": 0.77876621, + "learning_rate": 0.0008980790369994682, + "loss": 0.78986955, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1199, + "time_per_iteration": 2.989825487136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120977, + "balance_loss_mlp": 1.10670781, + "diversity_loss_mlp": 0.0, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.0677628031660983, + "language_loss": 0.8729977, + "learning_rate": 0.000897890449009863, + "loss": 0.88420743, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1200, + "time_per_iteration": 2.6784448623657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_mlp": 1.11330509, + "diversity_loss_mlp": 0.0, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.080414080555838, + "language_loss": 0.89825618, + "learning_rate": 0.0008977017065454853, + "loss": 0.90953267, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1201, + "time_per_iteration": 2.6610703468322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880483, + "balance_loss_mlp": 1.52539706, + "diversity_loss_mlp": 0.19880572, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.03277795962214655, + "language_loss": 0.80367738, + "learning_rate": 0.0008975128096796121, + "loss": 0.81248224, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01838172, + "step": 1202, + "time_per_iteration": 2.901998996734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145011, + "balance_loss_mlp": 1.13089633, + "diversity_loss_mlp": 0.0, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.10693947298766643, + "language_loss": 0.85848922, + "learning_rate": 0.0008973237584855794, + "loss": 0.86993933, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1203, + "time_per_iteration": 2.872408151626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160017, + "balance_loss_mlp": 1.1457237, + "diversity_loss_mlp": 0.0, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08753213296005687, + "language_loss": 0.82586002, + "learning_rate": 0.0008971345530367832, + "loss": 0.83746028, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1204, + "time_per_iteration": 2.4641921520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185717, + "balance_loss_mlp": 1.17120886, + "diversity_loss_mlp": 0.0, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07947534631123947, + "language_loss": 0.85658818, + "learning_rate": 0.0008969451934066799, + "loss": 0.8684454, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1205, + "time_per_iteration": 2.7822117805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173591, + "balance_loss_mlp": 1.15872586, + "diversity_loss_mlp": 0.0, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08780432716538046, + "language_loss": 0.79991889, + "learning_rate": 0.0008967556796687854, + "loss": 0.81165481, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1206, + "time_per_iteration": 2.8849406242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117013, + "balance_loss_mlp": 1.15584886, + "diversity_loss_mlp": 0.0, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.07569633120476413, + "language_loss": 0.83779937, + "learning_rate": 0.0008965660118966752, + "loss": 0.84950066, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1207, + "time_per_iteration": 2.9316329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146692, + "balance_loss_mlp": 1.1319102, + "diversity_loss_mlp": 0.0, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06968265941642382, + "language_loss": 0.90114093, + "learning_rate": 0.0008963761901639851, + "loss": 0.91260791, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1208, + "time_per_iteration": 2.8140323162078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113879, + "balance_loss_mlp": 1.12392485, + "diversity_loss_mlp": 0.0, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.08612535310277082, + "language_loss": 0.83098078, + "learning_rate": 0.0008961862145444103, + "loss": 0.84236872, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1209, + "time_per_iteration": 2.7529945373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_mlp": 1.10796285, + "diversity_loss_mlp": 0.0, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08243119711445285, + "language_loss": 0.85338795, + "learning_rate": 0.0008959960851117059, + "loss": 0.86461735, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1210, + "time_per_iteration": 2.624340534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108554, + "balance_loss_mlp": 1.09396267, + "diversity_loss_mlp": 0.0, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.10596241027535934, + "language_loss": 0.84048676, + "learning_rate": 0.0008958058019396868, + "loss": 0.85157233, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1211, + "time_per_iteration": 2.8316566944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112068, + "balance_loss_mlp": 1.09751284, + "diversity_loss_mlp": 0.0, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.07651667178885936, + "language_loss": 0.86494702, + "learning_rate": 0.0008956153651022274, + "loss": 0.8760677, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1212, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_mlp": 1.08926892, + "diversity_loss_mlp": 0.0, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07459915787800217, + "language_loss": 0.83929688, + "learning_rate": 0.0008954247746732618, + "loss": 0.85033321, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1213, + "time_per_iteration": 2.6184399127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117524, + "balance_loss_mlp": 1.10321903, + "diversity_loss_mlp": 0.0, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.08317009769115577, + "language_loss": 0.90604293, + "learning_rate": 0.0008952340307267837, + "loss": 0.91721821, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1214, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_mlp": 1.10553002, + "diversity_loss_mlp": 0.0, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.09601716623847659, + "language_loss": 0.83731341, + "learning_rate": 0.0008950431333368468, + "loss": 0.84850979, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1215, + "time_per_iteration": 2.6151199340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130858, + "balance_loss_mlp": 1.11676729, + "diversity_loss_mlp": 0.0, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.08049188450288745, + "language_loss": 0.84623635, + "learning_rate": 0.0008948520825775634, + "loss": 0.8575449, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1216, + "time_per_iteration": 3.645200490951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123063, + "balance_loss_mlp": 1.10880601, + "diversity_loss_mlp": 0.0, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.08038238822992319, + "language_loss": 0.83978343, + "learning_rate": 0.0008946608785231067, + "loss": 0.85101402, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1217, + "time_per_iteration": 2.871616840362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126329, + "balance_loss_mlp": 1.11263156, + "diversity_loss_mlp": 0.0, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07832391647543825, + "language_loss": 0.84442961, + "learning_rate": 0.0008944695212477084, + "loss": 0.85569292, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1218, + "time_per_iteration": 2.507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123493, + "balance_loss_mlp": 1.10867572, + "diversity_loss_mlp": 0.0, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.07420792055611987, + "language_loss": 0.86334574, + "learning_rate": 0.0008942780108256599, + "loss": 0.87458062, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1219, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.07657909053901747, + "language_loss": 0.86160946, + "learning_rate": 0.0008940863473313121, + "loss": 0.87268722, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1220, + "time_per_iteration": 2.495164632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_mlp": 1.09272623, + "diversity_loss_mlp": 0.0, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07962638616920462, + "language_loss": 0.87889743, + "learning_rate": 0.0008938945308390756, + "loss": 0.88997114, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1221, + "time_per_iteration": 2.613927125930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097552, + "balance_loss_mlp": 1.08298469, + "diversity_loss_mlp": 0.0, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.06679649396710063, + "language_loss": 0.87179595, + "learning_rate": 0.00089370256142342, + "loss": 0.88277149, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1222, + "time_per_iteration": 2.732208013534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094745, + "balance_loss_mlp": 1.07952189, + "diversity_loss_mlp": 0.0, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.06680688140454344, + "language_loss": 0.84810197, + "learning_rate": 0.0008935104391588746, + "loss": 0.85904944, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1223, + "time_per_iteration": 2.7585461139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.07917881, + "diversity_loss_mlp": 0.0, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.07271030004651308, + "language_loss": 0.83111542, + "learning_rate": 0.0008933181641200276, + "loss": 0.84206444, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.15710449, + "routerloss_mlp": 0.0, + "step": 1224, + "time_per_iteration": 3.1440725326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087445, + "balance_loss_mlp": 1.07139981, + "diversity_loss_mlp": 0.0, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.07882513603721358, + "language_loss": 0.85824931, + "learning_rate": 0.0008931257363815271, + "loss": 0.8691237, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.16040039, + "routerloss_mlp": 0.0, + "step": 1225, + "time_per_iteration": 2.8887243270874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092595, + "balance_loss_mlp": 1.07659674, + "diversity_loss_mlp": 0.0, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.09571789824401095, + "language_loss": 0.89901638, + "learning_rate": 0.0008929331560180798, + "loss": 0.90994227, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.15991211, + "routerloss_mlp": 0.0, + "step": 1226, + "time_per_iteration": 2.897155284881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095366, + "balance_loss_mlp": 1.07965469, + "diversity_loss_mlp": 0.0, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.068724406385502, + "language_loss": 0.90771782, + "learning_rate": 0.0008927404231044525, + "loss": 0.91867149, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 1227, + "time_per_iteration": 2.6892144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.08764625, + "diversity_loss_mlp": 0.0, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.06943954848997126, + "language_loss": 0.81646705, + "learning_rate": 0.0008925475377154703, + "loss": 0.82749879, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1228, + "time_per_iteration": 2.727325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.11394727, + "diversity_loss_mlp": 0.0, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.0778889683705481, + "language_loss": 0.8212285, + "learning_rate": 0.0008923544999260183, + "loss": 0.83252132, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1229, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.13194346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.0853653064859127, + "language_loss": 0.91254115, + "learning_rate": 0.00089216130981104, + "loss": 0.92400861, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1230, + "time_per_iteration": 3.016228199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138894, + "balance_loss_mlp": 1.12364721, + "diversity_loss_mlp": 0.0, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.08048994442870243, + "language_loss": 0.82752085, + "learning_rate": 0.000891967967445539, + "loss": 0.83890975, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.15222168, + "routerloss_mlp": 0.0, + "step": 1231, + "time_per_iteration": 2.65736722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126061, + "balance_loss_mlp": 1.11135054, + "diversity_loss_mlp": 0.0, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.05909715635047166, + "language_loss": 0.889099, + "learning_rate": 0.0008917744729045772, + "loss": 0.90035963, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1232, + "time_per_iteration": 2.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110871, + "balance_loss_mlp": 1.0962795, + "diversity_loss_mlp": 0.0, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08046733758331526, + "language_loss": 0.83836448, + "learning_rate": 0.0008915808262632757, + "loss": 0.84947324, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1233, + "time_per_iteration": 2.860353708267212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918962, + "balance_loss_mlp": 1.60287488, + "diversity_loss_mlp": 0.20008399, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.03182006079144566, + "language_loss": 0.93544835, + "learning_rate": 0.0008913870275968148, + "loss": 0.94463801, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017482, + "step": 1234, + "time_per_iteration": 2.7328829765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095751, + "balance_loss_mlp": 1.08008718, + "diversity_loss_mlp": 0.0, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.07195832826776788, + "language_loss": 0.87503707, + "learning_rate": 0.0008911930769804342, + "loss": 0.88599461, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1235, + "time_per_iteration": 3.2619638442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091405, + "balance_loss_mlp": 1.07551408, + "diversity_loss_mlp": 0.0, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.07148547933088874, + "language_loss": 0.91313815, + "learning_rate": 0.0008909989744894318, + "loss": 0.92405218, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1236, + "time_per_iteration": 2.8687992095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06530952, + "diversity_loss_mlp": 0.0, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.08021447901266163, + "language_loss": 0.81662518, + "learning_rate": 0.0008908047201991649, + "loss": 0.8274349, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1237, + "time_per_iteration": 2.737638235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076608, + "balance_loss_mlp": 1.06138515, + "diversity_loss_mlp": 0.0, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.07749899394714953, + "language_loss": 0.86585152, + "learning_rate": 0.0008906103141850502, + "loss": 0.87661767, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.15197754, + "routerloss_mlp": 0.0, + "step": 1238, + "time_per_iteration": 2.9184746742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068848, + "balance_loss_mlp": 1.05385113, + "diversity_loss_mlp": 0.0, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.10230617436374452, + "language_loss": 0.88104367, + "learning_rate": 0.0008904157565225621, + "loss": 0.89173216, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.1496582, + "routerloss_mlp": 0.0, + "step": 1239, + "time_per_iteration": 2.6396749019622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_mlp": 1.06220865, + "diversity_loss_mlp": 0.0, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.10467557893696883, + "language_loss": 0.81824136, + "learning_rate": 0.000890221047287235, + "loss": 0.82901168, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1240, + "time_per_iteration": 3.496812582015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081371, + "balance_loss_mlp": 1.06710172, + "diversity_loss_mlp": 0.0, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.09443583580909311, + "language_loss": 0.91125917, + "learning_rate": 0.0008900261865546615, + "loss": 0.92207289, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1241, + "time_per_iteration": 2.6527724266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_mlp": 1.0890398, + "diversity_loss_mlp": 0.0, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.08429957072104315, + "language_loss": 0.84985352, + "learning_rate": 0.0008898311744004936, + "loss": 0.86089325, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1242, + "time_per_iteration": 2.6740338802337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118763, + "balance_loss_mlp": 1.10411179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.07332762129893158, + "language_loss": 0.86932802, + "learning_rate": 0.0008896360109004414, + "loss": 0.88051569, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1243, + "time_per_iteration": 2.643489122390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12715125, + "diversity_loss_mlp": 0.0, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09306092844590973, + "language_loss": 0.84636557, + "learning_rate": 0.0008894406961302742, + "loss": 0.85778666, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1244, + "time_per_iteration": 2.5876173973083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150798, + "balance_loss_mlp": 1.13590896, + "diversity_loss_mlp": 0.0, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.0838589606869783, + "language_loss": 0.83944738, + "learning_rate": 0.0008892452301658201, + "loss": 0.85095537, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1245, + "time_per_iteration": 2.928391218185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116146, + "balance_loss_mlp": 1.1460346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.0736247551351698, + "language_loss": 0.83299339, + "learning_rate": 0.0008890496130829653, + "loss": 0.84460801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1246, + "time_per_iteration": 2.6510462760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915571, + "balance_loss_mlp": 1.59993446, + "diversity_loss_mlp": 0.1987851, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.03287481157446996, + "language_loss": 0.85918486, + "learning_rate": 0.0008888538449576555, + "loss": 0.86834061, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621127, + "step": 1247, + "time_per_iteration": 2.5719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178279, + "balance_loss_mlp": 1.16323447, + "diversity_loss_mlp": 0.0, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10811715250715398, + "language_loss": 0.83036304, + "learning_rate": 0.0008886579258658944, + "loss": 0.8421458, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.15014648, + "routerloss_mlp": 0.0, + "step": 1248, + "time_per_iteration": 2.5736701488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148631, + "balance_loss_mlp": 1.13341999, + "diversity_loss_mlp": 0.0, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.07868761607649298, + "language_loss": 0.84717274, + "learning_rate": 0.0008884618558837446, + "loss": 0.85865903, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1249, + "time_per_iteration": 2.8215761184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911764, + "balance_loss_mlp": 1.59372783, + "diversity_loss_mlp": 0.19720009, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.03236174678929329, + "language_loss": 0.8677094, + "learning_rate": 0.0008882656350873273, + "loss": 0.87682706, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01629994, + "step": 1250, + "time_per_iteration": 2.885092258453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126022, + "balance_loss_mlp": 1.11122799, + "diversity_loss_mlp": 0.0, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.08347743908005935, + "language_loss": 0.87000573, + "learning_rate": 0.0008880692635528219, + "loss": 0.88126594, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1251, + "time_per_iteration": 3.049070119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106629, + "balance_loss_mlp": 1.09177542, + "diversity_loss_mlp": 0.0, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.07406446185181008, + "language_loss": 0.89514965, + "learning_rate": 0.0008878727413564669, + "loss": 0.90621597, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1252, + "time_per_iteration": 2.734839677810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.06804204, + "diversity_loss_mlp": 0.0, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.048930323133030355, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81211317, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1253, + "time_per_iteration": 4.854974031448364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00873083, + "balance_loss_mlp": 1.51531768, + "diversity_loss_mlp": 0.19563958, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.03648198852202315, + "language_loss": 0.78763413, + "learning_rate": 0.0008874792452834528, + "loss": 0.7963649, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01760404, + "step": 1254, + "time_per_iteration": 2.803690195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_mlp": 1.07530415, + "diversity_loss_mlp": 0.0, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.09659900556863026, + "language_loss": 0.8729195, + "learning_rate": 0.0008872822715595626, + "loss": 0.88381982, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1255, + "time_per_iteration": 2.657867670059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_mlp": 1.06968451, + "diversity_loss_mlp": 0.0, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10497791491954662, + "language_loss": 0.87333822, + "learning_rate": 0.0008870851474793598, + "loss": 0.88418031, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1256, + "time_per_iteration": 2.5694568157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_mlp": 1.06920075, + "diversity_loss_mlp": 0.0, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.07331256259210016, + "language_loss": 0.89243567, + "learning_rate": 0.0008868878731193752, + "loss": 0.90327322, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1257, + "time_per_iteration": 2.829789400100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086138, + "balance_loss_mlp": 1.07158267, + "diversity_loss_mlp": 0.0, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.07236027639177293, + "language_loss": 0.89720446, + "learning_rate": 0.0008866904485561973, + "loss": 0.90806586, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1258, + "time_per_iteration": 2.731635570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078524, + "balance_loss_mlp": 1.06384969, + "diversity_loss_mlp": 0.0, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.0727569881861308, + "language_loss": 0.83084273, + "learning_rate": 0.000886492873866473, + "loss": 0.84162796, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1259, + "time_per_iteration": 2.8250575065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_mlp": 1.06528533, + "diversity_loss_mlp": 0.0, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.10762424055834904, + "language_loss": 0.84672934, + "learning_rate": 0.000886295149126908, + "loss": 0.85753107, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1260, + "time_per_iteration": 2.7148356437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086434, + "balance_loss_mlp": 1.07181931, + "diversity_loss_mlp": 0.0, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.07159531524201106, + "language_loss": 0.85693741, + "learning_rate": 0.0008860972744142655, + "loss": 0.86780179, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1261, + "time_per_iteration": 2.931696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115009, + "balance_loss_mlp": 1.10064411, + "diversity_loss_mlp": 0.0, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.065367920687613, + "language_loss": 0.81639904, + "learning_rate": 0.0008858992498053671, + "loss": 0.82754916, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1262, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_mlp": 1.04764521, + "diversity_loss_mlp": 0.0, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.03374572714932058, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77644455, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1263, + "time_per_iteration": 4.882519006729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872344, + "balance_loss_mlp": 1.51226497, + "diversity_loss_mlp": 0.19974959, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.03166105856965055, + "language_loss": 0.83409035, + "learning_rate": 0.0008855027512063817, + "loss": 0.84281385, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633644, + "step": 1264, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185798, + "balance_loss_mlp": 1.17132628, + "diversity_loss_mlp": 0.0, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.06261248257395001, + "language_loss": 0.85949916, + "learning_rate": 0.0008853042773702292, + "loss": 0.8713572, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1265, + "time_per_iteration": 2.695514440536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196886, + "balance_loss_mlp": 1.18234205, + "diversity_loss_mlp": 0.0, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.08760826562773598, + "language_loss": 0.87981403, + "learning_rate": 0.0008851056539456896, + "loss": 0.89178288, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1266, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119913, + "balance_loss_mlp": 1.18489647, + "diversity_loss_mlp": 0.0, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.07991839198753149, + "language_loss": 0.81904382, + "learning_rate": 0.0008849068810098755, + "loss": 0.83103514, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1267, + "time_per_iteration": 3.3067915439605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174372, + "balance_loss_mlp": 1.15992332, + "diversity_loss_mlp": 0.0, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.10499473220259715, + "language_loss": 0.83550054, + "learning_rate": 0.0008847079586399575, + "loss": 0.84724426, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1268, + "time_per_iteration": 2.4791157245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.13699341, + "diversity_loss_mlp": 0.0, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07765469411987547, + "language_loss": 0.86144567, + "learning_rate": 0.0008845088869131641, + "loss": 0.87295628, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1269, + "time_per_iteration": 2.6733555793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_mlp": 1.10053682, + "diversity_loss_mlp": 0.0, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.0888033537849515, + "language_loss": 0.88898385, + "learning_rate": 0.0008843096659067818, + "loss": 0.90013218, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1270, + "time_per_iteration": 2.6315910816192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111213, + "balance_loss_mlp": 1.09708679, + "diversity_loss_mlp": 0.0, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.09475560383246978, + "language_loss": 0.86565858, + "learning_rate": 0.000884110295698155, + "loss": 0.87677073, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1271, + "time_per_iteration": 2.926668643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.08752966, + "diversity_loss_mlp": 0.0, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.09917556522455147, + "language_loss": 0.85849231, + "learning_rate": 0.0008839107763646861, + "loss": 0.86951411, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1272, + "time_per_iteration": 2.58022403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_mlp": 1.08751881, + "diversity_loss_mlp": 0.0, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.08783320449451974, + "language_loss": 0.89941388, + "learning_rate": 0.0008837111079838353, + "loss": 0.91043806, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1273, + "time_per_iteration": 2.6877150535583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111566, + "balance_loss_mlp": 1.10096157, + "diversity_loss_mlp": 0.0, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.07640958054403056, + "language_loss": 0.89671296, + "learning_rate": 0.000883511290633121, + "loss": 0.90786958, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1274, + "time_per_iteration": 2.5929813385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123812, + "balance_loss_mlp": 1.10898256, + "diversity_loss_mlp": 0.0, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.05814589763763208, + "language_loss": 0.92211604, + "learning_rate": 0.000883311324390119, + "loss": 0.93335414, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1275, + "time_per_iteration": 2.721343517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138447, + "balance_loss_mlp": 1.12315261, + "diversity_loss_mlp": 0.0, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.10098653640048322, + "language_loss": 0.81237984, + "learning_rate": 0.0008831112093324629, + "loss": 0.82376432, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.15283203, + "routerloss_mlp": 0.0, + "step": 1276, + "time_per_iteration": 3.066657543182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148152, + "balance_loss_mlp": 1.13266695, + "diversity_loss_mlp": 0.0, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07328274291062464, + "language_loss": 0.89255905, + "learning_rate": 0.0008829109455378444, + "loss": 0.90404058, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 1277, + "time_per_iteration": 2.6705071926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163573, + "balance_loss_mlp": 1.14844561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08343231090098181, + "language_loss": 0.86569774, + "learning_rate": 0.000882710533084013, + "loss": 0.87733346, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1278, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.13783133, + "diversity_loss_mlp": 0.0, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0729065811951457, + "language_loss": 0.8929435, + "learning_rate": 0.0008825099720487755, + "loss": 0.90446699, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1279, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00676302, + "balance_loss_mlp": 1.12665224, + "diversity_loss_mlp": 0.19835761, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.0027483074809680533, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.75937444, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0137972, + "step": 1280, + "time_per_iteration": 4.88429594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_mlp": 1.10232449, + "diversity_loss_mlp": 0.0, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.05615046205501133, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79055113, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1281, + "time_per_iteration": 4.752316236495972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113823, + "balance_loss_mlp": 1.09987593, + "diversity_loss_mlp": 0.0, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.08093958913819582, + "language_loss": 0.89542687, + "learning_rate": 0.0008819073982335619, + "loss": 0.90656507, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1282, + "time_per_iteration": 2.876927137374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_mlp": 1.08783603, + "diversity_loss_mlp": 0.0, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07169123109412263, + "language_loss": 0.84362143, + "learning_rate": 0.0008817062436519235, + "loss": 0.8546381, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.13824463, + "routerloss_mlp": 0.0, + "step": 1283, + "time_per_iteration": 2.6551387310028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086846, + "balance_loss_mlp": 1.5022366, + "diversity_loss_mlp": 0.20048198, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.033180516132009126, + "language_loss": 0.89655471, + "learning_rate": 0.0008815049408787788, + "loss": 0.90523928, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01710081, + "step": 1284, + "time_per_iteration": 2.5652830600738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_mlp": 1.08698821, + "diversity_loss_mlp": 0.0, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.0762028673981185, + "language_loss": 0.85473216, + "learning_rate": 0.0008813034899922805, + "loss": 0.86573577, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1285, + "time_per_iteration": 2.549622058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111306, + "balance_loss_mlp": 1.09783578, + "diversity_loss_mlp": 0.0, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.11471388318643767, + "language_loss": 0.89855313, + "learning_rate": 0.0008811018910706387, + "loss": 0.9096663, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1286, + "time_per_iteration": 2.575176954269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_mlp": 1.10453439, + "diversity_loss_mlp": 0.0, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.10517914532856759, + "language_loss": 0.81922066, + "learning_rate": 0.0008809001441921211, + "loss": 0.83040059, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1287, + "time_per_iteration": 2.732236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.1132865, + "diversity_loss_mlp": 0.0, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.1440229573277689, + "language_loss": 0.85392761, + "learning_rate": 0.0008806982494350528, + "loss": 0.86519527, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1288, + "time_per_iteration": 2.6544177532196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168468, + "balance_loss_mlp": 1.1549263, + "diversity_loss_mlp": 0.0, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07192560701016996, + "language_loss": 0.9021467, + "learning_rate": 0.0008804962068778161, + "loss": 0.91383135, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1289, + "time_per_iteration": 2.8321304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_mlp": 1.20329499, + "diversity_loss_mlp": 0.0, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.08274381184261048, + "language_loss": 0.81234664, + "learning_rate": 0.0008802940165988511, + "loss": 0.82451665, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1290, + "time_per_iteration": 2.848726749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262968, + "balance_loss_mlp": 1.24875808, + "diversity_loss_mlp": 0.0, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.09449787402071168, + "language_loss": 0.88461435, + "learning_rate": 0.000880091678676655, + "loss": 0.8972441, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1291, + "time_per_iteration": 2.802199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279654, + "balance_loss_mlp": 1.26553965, + "diversity_loss_mlp": 0.0, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.11843407890200246, + "language_loss": 0.88870949, + "learning_rate": 0.0008798891931897821, + "loss": 0.90150601, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1292, + "time_per_iteration": 2.7150259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870403, + "balance_loss_mlp": 1.50883341, + "diversity_loss_mlp": 0.20002533, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.035309457370921726, + "language_loss": 0.84031773, + "learning_rate": 0.0008796865602168447, + "loss": 0.84902173, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597392, + "step": 1293, + "time_per_iteration": 2.5952000617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210957, + "balance_loss_mlp": 1.19661582, + "diversity_loss_mlp": 0.0, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.07909897749306223, + "language_loss": 0.88611919, + "learning_rate": 0.0008794837798365115, + "loss": 0.89822876, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1294, + "time_per_iteration": 2.6257524490356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.15246725, + "diversity_loss_mlp": 0.0, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.06704316740686254, + "language_loss": 0.8866623, + "learning_rate": 0.0008792808521275089, + "loss": 0.89834166, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1295, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_mlp": 1.13757372, + "diversity_loss_mlp": 0.0, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.08601952378824393, + "language_loss": 0.87496305, + "learning_rate": 0.0008790777771686206, + "loss": 0.88649786, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1296, + "time_per_iteration": 2.6131319999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_mlp": 1.10882747, + "diversity_loss_mlp": 0.0, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.0951042007575699, + "language_loss": 0.8543523, + "learning_rate": 0.0008788745550386872, + "loss": 0.86559939, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 1297, + "time_per_iteration": 2.5590503215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115503, + "balance_loss_mlp": 1.09948111, + "diversity_loss_mlp": 0.0, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.07219065567928346, + "language_loss": 0.80291975, + "learning_rate": 0.0008786711858166063, + "loss": 0.81407487, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1298, + "time_per_iteration": 2.951768398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00871436, + "balance_loss_mlp": 1.51113367, + "diversity_loss_mlp": 0.19870289, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.03357842357877673, + "language_loss": 0.83488023, + "learning_rate": 0.0008784676695813332, + "loss": 0.84359455, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165179, + "step": 1299, + "time_per_iteration": 2.985684871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_mlp": 1.07411456, + "diversity_loss_mlp": 0.0, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07050099983107566, + "language_loss": 0.84900999, + "learning_rate": 0.0008782640064118796, + "loss": 0.85990846, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1300, + "time_per_iteration": 2.943368673324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139161, + "balance_loss_mlp": 1.13172245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.062054541004710057, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323914, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 1301, + "time_per_iteration": 4.975619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_mlp": 1.09055138, + "diversity_loss_mlp": 0.0, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.08145949094764637, + "language_loss": 0.86554521, + "learning_rate": 0.0008778562395867648, + "loss": 0.87660533, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1302, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111342, + "balance_loss_mlp": 1.09572554, + "diversity_loss_mlp": 0.0, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.0727542370097133, + "language_loss": 0.84224409, + "learning_rate": 0.0008776521360894127, + "loss": 0.85335743, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 1303, + "time_per_iteration": 2.6512627601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_mlp": 1.02259421, + "diversity_loss_mlp": 0.0, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.02979233866947858, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79991817, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.07128906, + "routerloss_mlp": 0.0, + "step": 1304, + "time_per_iteration": 4.802467107772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112993, + "balance_loss_mlp": 1.11518431, + "diversity_loss_mlp": 0.0, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.07060498048015267, + "language_loss": 0.9057076, + "learning_rate": 0.0008772434893213186, + "loss": 0.91700697, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1305, + "time_per_iteration": 2.601546049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137218, + "balance_loss_mlp": 1.12251997, + "diversity_loss_mlp": 0.0, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.13797279723809866, + "language_loss": 0.84362888, + "learning_rate": 0.0008770389462092276, + "loss": 0.85500103, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1306, + "time_per_iteration": 2.626138210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_mlp": 1.12685966, + "diversity_loss_mlp": 0.0, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08471108342240245, + "language_loss": 0.86803389, + "learning_rate": 0.0008768342567176357, + "loss": 0.87944913, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1307, + "time_per_iteration": 2.8074796199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.12681937, + "diversity_loss_mlp": 0.0, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07263390393133992, + "language_loss": 0.90559924, + "learning_rate": 0.0008766294209260107, + "loss": 0.91701508, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1308, + "time_per_iteration": 2.670790910720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.13312435, + "diversity_loss_mlp": 0.0, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.07764888634730133, + "language_loss": 0.91554916, + "learning_rate": 0.0008764244389138767, + "loss": 0.92702377, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1309, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147476, + "balance_loss_mlp": 1.13318276, + "diversity_loss_mlp": 0.0, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09714227143719616, + "language_loss": 0.82980847, + "learning_rate": 0.000876219310760815, + "loss": 0.8412832, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1310, + "time_per_iteration": 2.8601791858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_mlp": 1.13273418, + "diversity_loss_mlp": 0.0, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.09648806821544922, + "language_loss": 0.81436276, + "learning_rate": 0.0008760140365464631, + "loss": 0.82583249, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1311, + "time_per_iteration": 2.599353790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870128, + "balance_loss_mlp": 1.50605726, + "diversity_loss_mlp": 0.20002663, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.03529693250820236, + "language_loss": 0.871418, + "learning_rate": 0.0008758086163505156, + "loss": 0.88011926, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170862, + "step": 1312, + "time_per_iteration": 2.6166832447052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163863, + "balance_loss_mlp": 1.14953399, + "diversity_loss_mlp": 0.0, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.07147814499844148, + "language_loss": 0.89267951, + "learning_rate": 0.0008756030502527239, + "loss": 0.90431809, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1313, + "time_per_iteration": 2.8452062606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188075, + "balance_loss_mlp": 1.17377019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.09335955432973846, + "language_loss": 0.90298462, + "learning_rate": 0.0008753973383328954, + "loss": 0.91486537, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1314, + "time_per_iteration": 2.6988537311553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.15108287, + "diversity_loss_mlp": 0.0, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08872096542459323, + "language_loss": 0.83944553, + "learning_rate": 0.0008751914806708952, + "loss": 0.85110015, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1315, + "time_per_iteration": 2.6328680515289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.1372478, + "diversity_loss_mlp": 0.0, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.09247066962171595, + "language_loss": 0.81854099, + "learning_rate": 0.0008749854773466439, + "loss": 0.83005595, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1316, + "time_per_iteration": 2.6708498001098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134446, + "balance_loss_mlp": 1.11980653, + "diversity_loss_mlp": 0.0, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.06992463478304738, + "language_loss": 0.84568423, + "learning_rate": 0.0008747793284401192, + "loss": 0.85702872, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1317, + "time_per_iteration": 2.70182204246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_mlp": 1.10560477, + "diversity_loss_mlp": 0.0, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.11229953955213261, + "language_loss": 0.85994983, + "learning_rate": 0.0008745730340313551, + "loss": 0.87115788, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1318, + "time_per_iteration": 2.8026556968688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.1048007, + "diversity_loss_mlp": 0.0, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.0843917818222923, + "language_loss": 0.84519732, + "learning_rate": 0.0008743665942004422, + "loss": 0.85639453, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.14904785, + "routerloss_mlp": 0.0, + "step": 1319, + "time_per_iteration": 2.6717073917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120645, + "balance_loss_mlp": 1.10569644, + "diversity_loss_mlp": 0.0, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.06860607652829093, + "language_loss": 0.92769039, + "learning_rate": 0.0008741600090275277, + "loss": 0.93889689, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1320, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120587, + "balance_loss_mlp": 1.10530448, + "diversity_loss_mlp": 0.0, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.09643257369734548, + "language_loss": 0.8425917, + "learning_rate": 0.0008739532785928151, + "loss": 0.85379755, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1321, + "time_per_iteration": 3.4925267696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_mlp": 1.09305024, + "diversity_loss_mlp": 0.0, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.04547815076873398, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75994641, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1322, + "time_per_iteration": 4.8446879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0085354, + "balance_loss_mlp": 1.4814328, + "diversity_loss_mlp": 0.19370571, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.036800523279172735, + "language_loss": 0.82844102, + "learning_rate": 0.0008735393822590908, + "loss": 0.83697641, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597124, + "step": 1323, + "time_per_iteration": 2.7354650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174586, + "balance_loss_mlp": 1.16032863, + "diversity_loss_mlp": 0.0, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08280852347492981, + "language_loss": 0.87442601, + "learning_rate": 0.0008733322165207681, + "loss": 0.88617194, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1324, + "time_per_iteration": 2.6581695079803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_mlp": 1.18529749, + "diversity_loss_mlp": 0.0, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.0779912319299164, + "language_loss": 0.8296451, + "learning_rate": 0.0008731249058420247, + "loss": 0.84164721, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1325, + "time_per_iteration": 3.0674960613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203892, + "balance_loss_mlp": 1.18865728, + "diversity_loss_mlp": 0.0, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.10695670124077197, + "language_loss": 0.90080667, + "learning_rate": 0.0008729174503033459, + "loss": 0.91284555, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1326, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188403, + "balance_loss_mlp": 1.17334652, + "diversity_loss_mlp": 0.0, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.10125548093505272, + "language_loss": 0.82427752, + "learning_rate": 0.0008727098499852728, + "loss": 0.83616149, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1327, + "time_per_iteration": 2.833803415298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150318, + "balance_loss_mlp": 1.13529778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.08478455973869617, + "language_loss": 0.89778203, + "learning_rate": 0.0008725021049684034, + "loss": 0.90928519, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.15002441, + "routerloss_mlp": 0.0, + "step": 1328, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116795, + "balance_loss_mlp": 1.10194123, + "diversity_loss_mlp": 0.0, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.07099770943741918, + "language_loss": 0.83078361, + "learning_rate": 0.000872294215333391, + "loss": 0.84195161, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1329, + "time_per_iteration": 3.219834089279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099158, + "balance_loss_mlp": 1.08430433, + "diversity_loss_mlp": 0.0, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.06913408205057751, + "language_loss": 0.82662833, + "learning_rate": 0.0008720861811609457, + "loss": 0.8376199, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1330, + "time_per_iteration": 2.753122329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096587, + "balance_loss_mlp": 1.0816741, + "diversity_loss_mlp": 0.0, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0919113566921475, + "language_loss": 0.83719599, + "learning_rate": 0.0008718780025318338, + "loss": 0.84816188, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1331, + "time_per_iteration": 2.724808692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.09296656, + "diversity_loss_mlp": 0.0, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.09880415123515712, + "language_loss": 0.83982158, + "learning_rate": 0.0008716696795268771, + "loss": 0.85089689, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1332, + "time_per_iteration": 2.718421220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098797, + "balance_loss_mlp": 1.08430111, + "diversity_loss_mlp": 0.0, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.15208681676824193, + "language_loss": 0.85333431, + "learning_rate": 0.0008714612122269538, + "loss": 0.8643223, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1333, + "time_per_iteration": 2.877823829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120258, + "balance_loss_mlp": 1.10586989, + "diversity_loss_mlp": 0.0, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.07756137703605612, + "language_loss": 0.89334106, + "learning_rate": 0.0008712526007129982, + "loss": 0.90454364, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1334, + "time_per_iteration": 2.561842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155014, + "balance_loss_mlp": 1.14101923, + "diversity_loss_mlp": 0.0, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.12724628219842446, + "language_loss": 0.90676123, + "learning_rate": 0.0008710438450660003, + "loss": 0.91831136, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1335, + "time_per_iteration": 2.6618270874023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199277, + "balance_loss_mlp": 1.18486404, + "diversity_loss_mlp": 0.0, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.10895723532104484, + "language_loss": 0.87596953, + "learning_rate": 0.0008708349453670064, + "loss": 0.88796222, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1336, + "time_per_iteration": 2.5121865272521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195197, + "balance_loss_mlp": 1.18032002, + "diversity_loss_mlp": 0.0, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.10227195785495524, + "language_loss": 0.91035736, + "learning_rate": 0.0008706259016971185, + "loss": 0.92230934, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1337, + "time_per_iteration": 2.7760090827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_mlp": 1.17414773, + "diversity_loss_mlp": 0.0, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.12625436277937716, + "language_loss": 0.83095431, + "learning_rate": 0.0008704167141374944, + "loss": 0.84284496, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1338, + "time_per_iteration": 2.824122428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_mlp": 1.13107228, + "diversity_loss_mlp": 0.0, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.0801465901926633, + "language_loss": 0.88427222, + "learning_rate": 0.0008702073827693482, + "loss": 0.89573455, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1339, + "time_per_iteration": 2.708488941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_mlp": 1.0865202, + "diversity_loss_mlp": 0.0, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07445900988257396, + "language_loss": 0.88514435, + "learning_rate": 0.0008699979076739494, + "loss": 0.89615613, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1340, + "time_per_iteration": 2.960650682449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07054412, + "diversity_loss_mlp": 0.0, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.09041758143252471, + "language_loss": 0.88622832, + "learning_rate": 0.0008697882889326234, + "loss": 0.89708054, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1341, + "time_per_iteration": 2.5199689865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094608, + "balance_loss_mlp": 1.08043432, + "diversity_loss_mlp": 0.0, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.08157938691300957, + "language_loss": 0.86840844, + "learning_rate": 0.0008695785266267515, + "loss": 0.87935448, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1342, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089859, + "balance_loss_mlp": 1.56664371, + "diversity_loss_mlp": 0.19803861, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.03344075262961686, + "language_loss": 0.83491886, + "learning_rate": 0.0008693686208377704, + "loss": 0.84390479, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01624843, + "step": 1343, + "time_per_iteration": 2.8157622814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101399, + "balance_loss_mlp": 1.08711743, + "diversity_loss_mlp": 0.0, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07460013341605923, + "language_loss": 0.89022982, + "learning_rate": 0.0008691585716471733, + "loss": 0.90124375, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1344, + "time_per_iteration": 2.6386232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.09707415, + "diversity_loss_mlp": 0.0, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.08548738123283665, + "language_loss": 0.85822487, + "learning_rate": 0.0008689483791365079, + "loss": 0.86934054, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1345, + "time_per_iteration": 2.831817626953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112096, + "balance_loss_mlp": 1.10685778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07218857890204664, + "language_loss": 0.89327282, + "learning_rate": 0.0008687380433873786, + "loss": 0.90448248, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1346, + "time_per_iteration": 2.8322408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1251955, + "diversity_loss_mlp": 0.0, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.07612070672802876, + "language_loss": 0.82638776, + "learning_rate": 0.0008685275644814448, + "loss": 0.83778065, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1347, + "time_per_iteration": 2.689772367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116224, + "balance_loss_mlp": 1.14764857, + "diversity_loss_mlp": 0.0, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07884944678342334, + "language_loss": 0.84390515, + "learning_rate": 0.0008683169425004216, + "loss": 0.85552752, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1348, + "time_per_iteration": 2.895153760910034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159732, + "balance_loss_mlp": 1.14511704, + "diversity_loss_mlp": 0.0, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.10354145261803285, + "language_loss": 0.83314335, + "learning_rate": 0.0008681061775260799, + "loss": 0.84474063, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1349, + "time_per_iteration": 2.850862503051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_mlp": 1.15118265, + "diversity_loss_mlp": 0.0, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08416928552821445, + "language_loss": 0.9214983, + "learning_rate": 0.0008678952696402458, + "loss": 0.93315852, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1350, + "time_per_iteration": 2.525019884109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_mlp": 1.13848734, + "diversity_loss_mlp": 0.0, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07397225666721696, + "language_loss": 0.86554277, + "learning_rate": 0.000867684218924801, + "loss": 0.87707639, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1351, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_mlp": 1.07517958, + "diversity_loss_mlp": 0.0, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.0438698963901256, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80030328, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1352, + "time_per_iteration": 4.916059255599976 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132931, + "balance_loss_mlp": 1.11807716, + "diversity_loss_mlp": 0.0, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06358739416567256, + "language_loss": 0.85154414, + "learning_rate": 0.0008672616893328834, + "loss": 0.86287344, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1353, + "time_per_iteration": 2.9301464557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120208, + "balance_loss_mlp": 1.10545015, + "diversity_loss_mlp": 0.0, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.0804298790611747, + "language_loss": 0.89736795, + "learning_rate": 0.0008670502106204512, + "loss": 0.90857005, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.14733887, + "routerloss_mlp": 0.0, + "step": 1354, + "time_per_iteration": 2.8392651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121529, + "balance_loss_mlp": 1.10672283, + "diversity_loss_mlp": 0.0, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.08121830869095954, + "language_loss": 0.81676221, + "learning_rate": 0.0008668385894064892, + "loss": 0.82797754, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1355, + "time_per_iteration": 2.632744550704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.10095191, + "diversity_loss_mlp": 0.0, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.0871855710564252, + "language_loss": 0.88984954, + "learning_rate": 0.0008666268257731562, + "loss": 0.90100139, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1356, + "time_per_iteration": 3.0961363315582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132093, + "balance_loss_mlp": 1.11785948, + "diversity_loss_mlp": 0.0, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.08548634624367135, + "language_loss": 0.8594982, + "learning_rate": 0.0008664149198026662, + "loss": 0.87081909, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1357, + "time_per_iteration": 3.2423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_mlp": 1.12039137, + "diversity_loss_mlp": 0.0, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.09109654485188295, + "language_loss": 0.88802171, + "learning_rate": 0.0008662028715772883, + "loss": 0.89936113, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1358, + "time_per_iteration": 2.619495153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138578, + "balance_loss_mlp": 1.12476182, + "diversity_loss_mlp": 0.0, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.07135790209188476, + "language_loss": 0.85816395, + "learning_rate": 0.0008659906811793467, + "loss": 0.86954975, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.13842773, + "routerloss_mlp": 0.0, + "step": 1359, + "time_per_iteration": 2.6752817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135637, + "balance_loss_mlp": 1.12191582, + "diversity_loss_mlp": 0.0, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.07783428421444573, + "language_loss": 0.89649427, + "learning_rate": 0.0008657783486912215, + "loss": 0.90785068, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1360, + "time_per_iteration": 2.770136594772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918859, + "balance_loss_mlp": 1.60386825, + "diversity_loss_mlp": 0.20058532, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.03438194549161764, + "language_loss": 0.90315008, + "learning_rate": 0.0008655658741953472, + "loss": 0.91233867, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01663268, + "step": 1361, + "time_per_iteration": 3.239567518234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117406, + "balance_loss_mlp": 1.10352993, + "diversity_loss_mlp": 0.0, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.053733033776962646, + "language_loss": 0.88311911, + "learning_rate": 0.0008653532577742136, + "loss": 0.89429319, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1362, + "time_per_iteration": 2.6912107467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111717, + "balance_loss_mlp": 1.09805584, + "diversity_loss_mlp": 0.0, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07456283347469675, + "language_loss": 0.8687824, + "learning_rate": 0.0008651404995103659, + "loss": 0.87989956, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1363, + "time_per_iteration": 2.5554919242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_mlp": 1.09212554, + "diversity_loss_mlp": 0.0, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.0735216597505126, + "language_loss": 0.87311852, + "learning_rate": 0.0008649275994864041, + "loss": 0.88418221, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1364, + "time_per_iteration": 2.7228429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109936, + "balance_loss_mlp": 1.0955832, + "diversity_loss_mlp": 0.0, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06423000395680191, + "language_loss": 0.83767593, + "learning_rate": 0.0008647145577849834, + "loss": 0.84877527, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1365, + "time_per_iteration": 2.8194234371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_mlp": 1.09573257, + "diversity_loss_mlp": 0.0, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0636918785190987, + "language_loss": 0.82912111, + "learning_rate": 0.0008645013744888139, + "loss": 0.8402251, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1366, + "time_per_iteration": 2.9121909141540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106528, + "balance_loss_mlp": 1.09266424, + "diversity_loss_mlp": 0.0, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.07268525177684865, + "language_loss": 0.87255573, + "learning_rate": 0.0008642880496806607, + "loss": 0.88362104, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1367, + "time_per_iteration": 2.7527663707733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117256, + "balance_loss_mlp": 1.1027844, + "diversity_loss_mlp": 0.0, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.06883104565378229, + "language_loss": 0.84193766, + "learning_rate": 0.0008640745834433437, + "loss": 0.85311019, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1368, + "time_per_iteration": 2.7203800678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114698, + "balance_loss_mlp": 1.10065532, + "diversity_loss_mlp": 0.0, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.0718323039568536, + "language_loss": 0.87083656, + "learning_rate": 0.000863860975859738, + "loss": 0.88198352, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1369, + "time_per_iteration": 2.9021553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116648, + "balance_loss_mlp": 1.10278392, + "diversity_loss_mlp": 0.0, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08463505288724613, + "language_loss": 0.88568735, + "learning_rate": 0.0008636472270127733, + "loss": 0.8968538, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1370, + "time_per_iteration": 2.6336748600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_mlp": 1.10440779, + "diversity_loss_mlp": 0.0, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.08505114845208346, + "language_loss": 0.90530956, + "learning_rate": 0.0008634333369854345, + "loss": 0.91649872, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1371, + "time_per_iteration": 2.585775136947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122621, + "balance_loss_mlp": 1.10868549, + "diversity_loss_mlp": 0.0, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.07138701063901956, + "language_loss": 0.87574148, + "learning_rate": 0.0008632193058607608, + "loss": 0.88696772, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.13952637, + "routerloss_mlp": 0.0, + "step": 1372, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124687, + "balance_loss_mlp": 1.11042953, + "diversity_loss_mlp": 0.0, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.09395332240398839, + "language_loss": 0.81125695, + "learning_rate": 0.0008630051337218466, + "loss": 0.82250381, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1373, + "time_per_iteration": 2.6700031757354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118707, + "balance_loss_mlp": 1.10506988, + "diversity_loss_mlp": 0.0, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0808240378873911, + "language_loss": 0.82403839, + "learning_rate": 0.0008627908206518409, + "loss": 0.83522546, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1374, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.04099598647265769, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76212597, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 1375, + "time_per_iteration": 4.979893922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109458, + "balance_loss_mlp": 1.09580863, + "diversity_loss_mlp": 0.0, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06989177478220372, + "language_loss": 0.91488004, + "learning_rate": 0.0008623617720514241, + "loss": 0.92597461, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1376, + "time_per_iteration": 2.6515755653381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_mlp": 1.09554029, + "diversity_loss_mlp": 0.0, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.07399727326907257, + "language_loss": 0.84706682, + "learning_rate": 0.0008621470366875848, + "loss": 0.85816133, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1377, + "time_per_iteration": 2.599776268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119233, + "balance_loss_mlp": 1.10546422, + "diversity_loss_mlp": 0.0, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07769258092785128, + "language_loss": 0.87980253, + "learning_rate": 0.0008619321607257966, + "loss": 0.89099485, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1378, + "time_per_iteration": 2.678865671157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.10274947, + "diversity_loss_mlp": 0.0, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07519514659764338, + "language_loss": 0.82002568, + "learning_rate": 0.000861717144249482, + "loss": 0.83118635, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1379, + "time_per_iteration": 2.8830740451812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_mlp": 1.10515702, + "diversity_loss_mlp": 0.0, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06542821866252439, + "language_loss": 0.89670694, + "learning_rate": 0.0008615019873421175, + "loss": 0.90789306, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.1348877, + "routerloss_mlp": 0.0, + "step": 1380, + "time_per_iteration": 2.4692320823669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124803, + "balance_loss_mlp": 1.11096311, + "diversity_loss_mlp": 0.0, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.08230289019981965, + "language_loss": 0.85984069, + "learning_rate": 0.0008612866900872349, + "loss": 0.87108874, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1381, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119212, + "balance_loss_mlp": 1.10564578, + "diversity_loss_mlp": 0.0, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.09708901974799254, + "language_loss": 0.8800329, + "learning_rate": 0.0008610712525684197, + "loss": 0.89122504, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1382, + "time_per_iteration": 2.673672676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.12075388, + "diversity_loss_mlp": 0.0, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.08550137436350284, + "language_loss": 0.84231853, + "learning_rate": 0.0008608556748693121, + "loss": 0.85366714, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1383, + "time_per_iteration": 3.285391330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113293, + "balance_loss_mlp": 1.11881518, + "diversity_loss_mlp": 0.0, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.07276264363306281, + "language_loss": 0.86098409, + "learning_rate": 0.000860639957073607, + "loss": 0.87231338, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1384, + "time_per_iteration": 2.74979829788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130834, + "balance_loss_mlp": 1.11668396, + "diversity_loss_mlp": 0.0, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07735164598050102, + "language_loss": 0.87488532, + "learning_rate": 0.0008604240992650534, + "loss": 0.88619369, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1385, + "time_per_iteration": 2.765714406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113264, + "balance_loss_mlp": 1.11819148, + "diversity_loss_mlp": 0.0, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.09224305204204497, + "language_loss": 0.89344275, + "learning_rate": 0.0008602081015274545, + "loss": 0.90476912, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1386, + "time_per_iteration": 2.7466471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_mlp": 1.11580229, + "diversity_loss_mlp": 0.0, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.08049268911379595, + "language_loss": 0.83551365, + "learning_rate": 0.0008599919639446684, + "loss": 0.84681749, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1387, + "time_per_iteration": 2.680053234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119435, + "balance_loss_mlp": 1.10439074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08313146027802099, + "language_loss": 0.80363739, + "learning_rate": 0.000859775686600607, + "loss": 0.81483173, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1388, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114186, + "balance_loss_mlp": 1.12722135, + "diversity_loss_mlp": 0.0, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.08559032433145165, + "language_loss": 0.85052109, + "learning_rate": 0.0008595592695792367, + "loss": 0.86193967, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1389, + "time_per_iteration": 2.660012722015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_mlp": 1.11312914, + "diversity_loss_mlp": 0.0, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.07620364037172102, + "language_loss": 0.90774226, + "learning_rate": 0.0008593427129645778, + "loss": 0.91901946, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1390, + "time_per_iteration": 2.62744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131127, + "balance_loss_mlp": 1.11615419, + "diversity_loss_mlp": 0.0, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.0742307152228864, + "language_loss": 0.85619152, + "learning_rate": 0.0008591260168407052, + "loss": 0.86750275, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1391, + "time_per_iteration": 2.738680124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_mlp": 1.09930313, + "diversity_loss_mlp": 0.0, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.05574398067767488, + "language_loss": 0.82839364, + "learning_rate": 0.0008589091812917479, + "loss": 0.83953172, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1392, + "time_per_iteration": 2.5947506427764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_mlp": 1.09471345, + "diversity_loss_mlp": 0.0, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07022348692687568, + "language_loss": 0.85257161, + "learning_rate": 0.0008586922064018887, + "loss": 0.86366403, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1393, + "time_per_iteration": 2.6624581813812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_mlp": 1.09542501, + "diversity_loss_mlp": 0.0, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.07561979453055602, + "language_loss": 0.89401793, + "learning_rate": 0.0008584750922553651, + "loss": 0.9051199, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1394, + "time_per_iteration": 3.1940202713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107917, + "balance_loss_mlp": 1.0934931, + "diversity_loss_mlp": 0.0, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.07234350422575066, + "language_loss": 0.83740592, + "learning_rate": 0.0008582578389364677, + "loss": 0.84848505, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1395, + "time_per_iteration": 2.8844621181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_mlp": 1.09147811, + "diversity_loss_mlp": 0.0, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.061968206774760184, + "language_loss": 0.91908813, + "learning_rate": 0.0008580404465295422, + "loss": 0.93014938, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1396, + "time_per_iteration": 2.7842769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_mlp": 1.09155917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07293181793333794, + "language_loss": 0.88274646, + "learning_rate": 0.0008578229151189876, + "loss": 0.89380777, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1397, + "time_per_iteration": 2.96771502494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_mlp": 1.08638036, + "diversity_loss_mlp": 0.0, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.08798004746081324, + "language_loss": 0.81253606, + "learning_rate": 0.0008576052447892573, + "loss": 0.82354569, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1398, + "time_per_iteration": 2.5413830280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_mlp": 1.08761334, + "diversity_loss_mlp": 0.0, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.0737959226904994, + "language_loss": 0.86320835, + "learning_rate": 0.000857387435624858, + "loss": 0.87422657, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1399, + "time_per_iteration": 2.554016351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00934821, + "balance_loss_mlp": 1.63627267, + "diversity_loss_mlp": 0.20064378, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.02492172823463741, + "language_loss": 0.88190895, + "learning_rate": 0.0008571694877103513, + "loss": 0.89125717, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636335, + "step": 1400, + "time_per_iteration": 3.307114839553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_mlp": 1.09591365, + "diversity_loss_mlp": 0.0, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.07757128819182789, + "language_loss": 0.87680864, + "learning_rate": 0.0008569514011303515, + "loss": 0.88791251, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1401, + "time_per_iteration": 2.800502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917512, + "balance_loss_mlp": 1.60226941, + "diversity_loss_mlp": 0.19939175, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.03393521208879438, + "language_loss": 0.88186574, + "learning_rate": 0.0008567331759695277, + "loss": 0.8910408, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01668182, + "step": 1402, + "time_per_iteration": 2.7670016288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108043, + "balance_loss_mlp": 1.09297514, + "diversity_loss_mlp": 0.0, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.0674494366068644, + "language_loss": 0.86427194, + "learning_rate": 0.0008565148123126023, + "loss": 0.87535238, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1403, + "time_per_iteration": 2.660659074783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_mlp": 1.08053553, + "diversity_loss_mlp": 0.0, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.059221605294443855, + "language_loss": 0.86113608, + "learning_rate": 0.0008562963102443516, + "loss": 0.8720839, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1404, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_mlp": 1.090042, + "diversity_loss_mlp": 0.0, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.08483345099627004, + "language_loss": 0.85166299, + "learning_rate": 0.0008560776698496056, + "loss": 0.86270541, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1405, + "time_per_iteration": 2.9167518615722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110133, + "balance_loss_mlp": 1.09539831, + "diversity_loss_mlp": 0.0, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.06923600464578249, + "language_loss": 0.85861331, + "learning_rate": 0.0008558588912132481, + "loss": 0.86971468, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1406, + "time_per_iteration": 2.8346776962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00696474, + "balance_loss_mlp": 1.17983532, + "diversity_loss_mlp": 0.18206902, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.0036772550136199766, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77155459, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0155216, + "step": 1407, + "time_per_iteration": 4.943782091140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.09137964, + "diversity_loss_mlp": 0.0, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.08329945876184135, + "language_loss": 0.82942384, + "learning_rate": 0.0008554209195555016, + "loss": 0.84047806, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1408, + "time_per_iteration": 2.7417516708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125368, + "balance_loss_mlp": 1.11146832, + "diversity_loss_mlp": 0.0, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.06975199960684045, + "language_loss": 0.8827157, + "learning_rate": 0.0008552017267041483, + "loss": 0.89396936, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1409, + "time_per_iteration": 2.6978721618652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126015, + "balance_loss_mlp": 1.11216331, + "diversity_loss_mlp": 0.0, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06710824628929367, + "language_loss": 0.83395678, + "learning_rate": 0.0008549823959512549, + "loss": 0.84521693, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1410, + "time_per_iteration": 2.6867637634277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_mlp": 1.11246991, + "diversity_loss_mlp": 0.0, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07002470067050659, + "language_loss": 0.86486357, + "learning_rate": 0.0008547629273819728, + "loss": 0.87612069, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.13262939, + "routerloss_mlp": 0.0, + "step": 1411, + "time_per_iteration": 3.410454750061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_mlp": 1.12940812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07619635814943253, + "language_loss": 0.83522588, + "learning_rate": 0.0008545433210815074, + "loss": 0.84665549, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1412, + "time_per_iteration": 2.638172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.12536621, + "diversity_loss_mlp": 0.0, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.06317158203016926, + "language_loss": 0.87351668, + "learning_rate": 0.0008543235771351176, + "loss": 0.88490719, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1413, + "time_per_iteration": 2.7705581188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159735, + "balance_loss_mlp": 1.14645457, + "diversity_loss_mlp": 0.0, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.08259318688939964, + "language_loss": 0.84684592, + "learning_rate": 0.0008541036956281154, + "loss": 0.85844326, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1414, + "time_per_iteration": 2.8803579807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_mlp": 1.13435841, + "diversity_loss_mlp": 0.0, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09396951476817994, + "language_loss": 0.81928164, + "learning_rate": 0.0008538836766458665, + "loss": 0.83076018, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.13519287, + "routerloss_mlp": 0.0, + "step": 1415, + "time_per_iteration": 2.860991954803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140979, + "balance_loss_mlp": 1.12721062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.07553622395064079, + "language_loss": 0.84927893, + "learning_rate": 0.0008536635202737897, + "loss": 0.86068869, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1416, + "time_per_iteration": 2.848196268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146453, + "balance_loss_mlp": 1.13278019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.07031625369418516, + "language_loss": 0.82188255, + "learning_rate": 0.0008534432265973573, + "loss": 0.83334708, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1417, + "time_per_iteration": 2.6029789447784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153419, + "balance_loss_mlp": 1.13950717, + "diversity_loss_mlp": 0.0, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07823597875801033, + "language_loss": 0.88322413, + "learning_rate": 0.000853222795702095, + "loss": 0.89475828, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1418, + "time_per_iteration": 3.3933968544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.13570726, + "diversity_loss_mlp": 0.0, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.07267637680100167, + "language_loss": 0.83730674, + "learning_rate": 0.0008530022276735813, + "loss": 0.84880364, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1419, + "time_per_iteration": 2.766181707382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_mlp": 1.12086129, + "diversity_loss_mlp": 0.0, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.06887995103877555, + "language_loss": 0.86238861, + "learning_rate": 0.0008527815225974489, + "loss": 0.87373358, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1420, + "time_per_iteration": 2.6471102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135972, + "balance_loss_mlp": 1.12148833, + "diversity_loss_mlp": 0.0, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10131461494963417, + "language_loss": 0.88726115, + "learning_rate": 0.0008525606805593829, + "loss": 0.89862096, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1421, + "time_per_iteration": 2.436647653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_mlp": 1.10405266, + "diversity_loss_mlp": 0.0, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0859881194807961, + "language_loss": 0.8254106, + "learning_rate": 0.0008523397016451213, + "loss": 0.83659345, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1422, + "time_per_iteration": 2.593588352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_mlp": 1.08907628, + "diversity_loss_mlp": 0.0, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.06052148467578676, + "language_loss": 0.87038374, + "learning_rate": 0.0008521185859404564, + "loss": 0.88142037, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1423, + "time_per_iteration": 3.3936307430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092129, + "balance_loss_mlp": 1.07775199, + "diversity_loss_mlp": 0.0, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06977326166261295, + "language_loss": 0.8940134, + "learning_rate": 0.0008518973335312326, + "loss": 0.90493476, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1424, + "time_per_iteration": 2.7834270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081272, + "balance_loss_mlp": 1.06702638, + "diversity_loss_mlp": 0.0, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.119675165593639, + "language_loss": 0.83282709, + "learning_rate": 0.0008516759445033477, + "loss": 0.84363985, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1425, + "time_per_iteration": 2.665099859237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.06930685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08266887436661914, + "language_loss": 0.85026807, + "learning_rate": 0.0008514544189427526, + "loss": 0.86110568, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1426, + "time_per_iteration": 2.6887404918670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_mlp": 1.07249546, + "diversity_loss_mlp": 0.0, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.06908859165293682, + "language_loss": 0.86575979, + "learning_rate": 0.0008512327569354511, + "loss": 0.87662017, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1427, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_mlp": 1.09480238, + "diversity_loss_mlp": 0.0, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08987008099145026, + "language_loss": 0.8368206, + "learning_rate": 0.0008510109585675001, + "loss": 0.847902, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.13360596, + "routerloss_mlp": 0.0, + "step": 1428, + "time_per_iteration": 2.613348960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140613, + "balance_loss_mlp": 1.13260245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.05207498704371428, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82293957, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1429, + "time_per_iteration": 4.706013202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133032, + "balance_loss_mlp": 1.11977601, + "diversity_loss_mlp": 0.0, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.09002666847623074, + "language_loss": 0.80503839, + "learning_rate": 0.0008505669530941415, + "loss": 0.8163687, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1430, + "time_per_iteration": 3.2976372241973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0097004, + "balance_loss_mlp": 1.70641518, + "diversity_loss_mlp": 0.20088202, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.03747760406507578, + "language_loss": 0.84294951, + "learning_rate": 0.000850344746161112, + "loss": 0.85264993, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01639144, + "step": 1431, + "time_per_iteration": 2.6297106742858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_mlp": 1.12685704, + "diversity_loss_mlp": 0.0, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.08230554095697513, + "language_loss": 0.87346137, + "learning_rate": 0.0008501224032121894, + "loss": 0.88486063, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1432, + "time_per_iteration": 2.4853787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129049, + "balance_loss_mlp": 1.1158998, + "diversity_loss_mlp": 0.0, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06557126517551867, + "language_loss": 0.82118285, + "learning_rate": 0.0008498999243336946, + "loss": 0.83247334, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1433, + "time_per_iteration": 2.623809576034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11776567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.0832335684907068, + "language_loss": 0.87471139, + "learning_rate": 0.0008496773096120021, + "loss": 0.88601708, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.12817383, + "routerloss_mlp": 0.0, + "step": 1434, + "time_per_iteration": 2.7995760440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_mlp": 1.10637057, + "diversity_loss_mlp": 0.0, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.10286197296711953, + "language_loss": 0.84387434, + "learning_rate": 0.0008494545591335381, + "loss": 0.85507143, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.13354492, + "routerloss_mlp": 0.0, + "step": 1435, + "time_per_iteration": 2.933576822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_mlp": 1.09978795, + "diversity_loss_mlp": 0.0, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.053150449500146836, + "language_loss": 0.86971611, + "learning_rate": 0.0008492316729847823, + "loss": 0.88084674, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1436, + "time_per_iteration": 2.8865604400634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.09676659, + "diversity_loss_mlp": 0.0, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08937825724590943, + "language_loss": 0.7968539, + "learning_rate": 0.0008490086512522664, + "loss": 0.80795395, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1437, + "time_per_iteration": 2.7166872024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105369, + "balance_loss_mlp": 1.0916723, + "diversity_loss_mlp": 0.0, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.09013751301914075, + "language_loss": 0.90582836, + "learning_rate": 0.0008487854940225755, + "loss": 0.91688204, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1438, + "time_per_iteration": 2.4426465034484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_mlp": 1.08844161, + "diversity_loss_mlp": 0.0, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.09066429268698341, + "language_loss": 0.89896768, + "learning_rate": 0.0008485622013823466, + "loss": 0.90999383, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1439, + "time_per_iteration": 2.599177360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07675576, + "diversity_loss_mlp": 0.0, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08059762035463526, + "language_loss": 0.83446515, + "learning_rate": 0.00084833877341827, + "loss": 0.84537244, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1440, + "time_per_iteration": 2.667215347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.0762167, + "diversity_loss_mlp": 0.0, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.07889497077341047, + "language_loss": 0.80625433, + "learning_rate": 0.000848115210217088, + "loss": 0.81715715, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1441, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.08003855, + "diversity_loss_mlp": 0.0, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08443965058939805, + "language_loss": 0.81771946, + "learning_rate": 0.0008478915118655952, + "loss": 0.82866359, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1442, + "time_per_iteration": 2.743678569793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_mlp": 1.10385561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07019455815968899, + "language_loss": 0.86195552, + "learning_rate": 0.0008476676784506393, + "loss": 0.87313789, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1443, + "time_per_iteration": 2.663422107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.10996866, + "diversity_loss_mlp": 0.0, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.08623331537045495, + "language_loss": 0.81889486, + "learning_rate": 0.0008474437100591201, + "loss": 0.83014178, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1444, + "time_per_iteration": 3.340557813644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_mlp": 1.11489129, + "diversity_loss_mlp": 0.0, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08279806566523454, + "language_loss": 0.85577607, + "learning_rate": 0.0008472196067779898, + "loss": 0.86707067, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1445, + "time_per_iteration": 2.675623655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112665, + "balance_loss_mlp": 1.09800267, + "diversity_loss_mlp": 0.0, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10281028137483857, + "language_loss": 0.85108185, + "learning_rate": 0.0008469953686942531, + "loss": 0.86220849, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1446, + "time_per_iteration": 3.0647382736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933894, + "balance_loss_mlp": 1.63962197, + "diversity_loss_mlp": 0.19544066, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.039122045531048345, + "language_loss": 0.83261281, + "learning_rate": 0.0008467709958949668, + "loss": 0.84195173, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636306, + "step": 1447, + "time_per_iteration": 2.777806043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932176, + "balance_loss_mlp": 1.63710666, + "diversity_loss_mlp": 0.19454433, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.036668832644649825, + "language_loss": 0.85678959, + "learning_rate": 0.0008465464884672403, + "loss": 0.8661114, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01635053, + "step": 1448, + "time_per_iteration": 2.7313778400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109364, + "balance_loss_mlp": 1.07944214, + "diversity_loss_mlp": 0.0, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.08672786191572247, + "language_loss": 0.85892808, + "learning_rate": 0.0008463218464982348, + "loss": 0.86986446, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1449, + "time_per_iteration": 2.8115885257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109775, + "balance_loss_mlp": 1.08367157, + "diversity_loss_mlp": 0.0, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.09681901325388456, + "language_loss": 0.8756566, + "learning_rate": 0.0008460970700751645, + "loss": 0.88663405, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1450, + "time_per_iteration": 3.071645975112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.07963276, + "diversity_loss_mlp": 0.0, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.09020366192691211, + "language_loss": 0.87640095, + "learning_rate": 0.000845872159285295, + "loss": 0.88733411, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1451, + "time_per_iteration": 2.7342164516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_mlp": 1.04301238, + "diversity_loss_mlp": 0.0, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.032344288076380935, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78818536, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1452, + "time_per_iteration": 4.95387077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_mlp": 1.10795009, + "diversity_loss_mlp": 0.0, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.08097200979220782, + "language_loss": 0.86171871, + "learning_rate": 0.0008454219349544836, + "loss": 0.87293363, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1453, + "time_per_iteration": 3.373755693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127619, + "balance_loss_mlp": 1.11439896, + "diversity_loss_mlp": 0.0, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.0882994281711823, + "language_loss": 0.81864405, + "learning_rate": 0.000845196621588334, + "loss": 0.82992017, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.13244629, + "routerloss_mlp": 0.0, + "step": 1454, + "time_per_iteration": 2.758122682571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147815, + "balance_loss_mlp": 1.13453507, + "diversity_loss_mlp": 0.0, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.06575509380885615, + "language_loss": 0.76256007, + "learning_rate": 0.0008449711742049706, + "loss": 0.7740382, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1455, + "time_per_iteration": 2.752345561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.1432693, + "diversity_loss_mlp": 0.0, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.10411587441286801, + "language_loss": 0.84306383, + "learning_rate": 0.0008447455928919196, + "loss": 0.85462898, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1456, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.13327312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.07273170046833245, + "language_loss": 0.86767292, + "learning_rate": 0.0008445198777367595, + "loss": 0.87913817, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1457, + "time_per_iteration": 2.614743947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144047, + "balance_loss_mlp": 1.13080251, + "diversity_loss_mlp": 0.0, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.08362811388708001, + "language_loss": 0.81054902, + "learning_rate": 0.0008442940288271208, + "loss": 0.82198954, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1458, + "time_per_iteration": 2.615705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112578, + "balance_loss_mlp": 1.11191583, + "diversity_loss_mlp": 0.0, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06892977395484212, + "language_loss": 0.8688817, + "learning_rate": 0.0008440680462506856, + "loss": 0.88013953, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1459, + "time_per_iteration": 2.810474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_mlp": 1.10828125, + "diversity_loss_mlp": 0.0, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.06441288224223744, + "language_loss": 0.86424565, + "learning_rate": 0.0008438419300951883, + "loss": 0.87545788, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1460, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_mlp": 1.10215354, + "diversity_loss_mlp": 0.0, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.12446768600100189, + "language_loss": 0.86647975, + "learning_rate": 0.0008436156804484148, + "loss": 0.87763494, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1461, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110833, + "balance_loss_mlp": 1.0965395, + "diversity_loss_mlp": 0.0, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.08490544085138897, + "language_loss": 0.88168794, + "learning_rate": 0.0008433892973982031, + "loss": 0.89279622, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1462, + "time_per_iteration": 2.561211347579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10098886, + "diversity_loss_mlp": 0.0, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07295818188475026, + "language_loss": 0.84776855, + "learning_rate": 0.0008431627810324431, + "loss": 0.85892212, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1463, + "time_per_iteration": 2.654146671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_mlp": 1.10345769, + "diversity_loss_mlp": 0.0, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06893619297503142, + "language_loss": 0.8126353, + "learning_rate": 0.000842936131439076, + "loss": 0.82381272, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1464, + "time_per_iteration": 2.6571760177612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_mlp": 1.1010766, + "diversity_loss_mlp": 0.0, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.07879840484237804, + "language_loss": 0.87885797, + "learning_rate": 0.0008427093487060951, + "loss": 0.89001191, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1465, + "time_per_iteration": 2.6847336292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101907, + "balance_loss_mlp": 1.08776927, + "diversity_loss_mlp": 0.0, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06118480673876746, + "language_loss": 0.84661305, + "learning_rate": 0.000842482432921545, + "loss": 0.8576321, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1466, + "time_per_iteration": 2.884965181350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_mlp": 1.09353852, + "diversity_loss_mlp": 0.0, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07927655906335743, + "language_loss": 0.87199128, + "learning_rate": 0.0008422553841735225, + "loss": 0.88306642, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1467, + "time_per_iteration": 2.528017997741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115631, + "balance_loss_mlp": 1.10146928, + "diversity_loss_mlp": 0.0, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07348722340160863, + "language_loss": 0.84837711, + "learning_rate": 0.0008420282025501757, + "loss": 0.85953343, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1468, + "time_per_iteration": 2.7696359157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_mlp": 1.10156429, + "diversity_loss_mlp": 0.0, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07024793700711117, + "language_loss": 0.85080296, + "learning_rate": 0.0008418008881397043, + "loss": 0.86195612, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1469, + "time_per_iteration": 2.659646511077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115825, + "balance_loss_mlp": 1.10241413, + "diversity_loss_mlp": 0.0, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.12791916727658353, + "language_loss": 0.82420468, + "learning_rate": 0.0008415734410303595, + "loss": 0.83536291, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1470, + "time_per_iteration": 3.2350287437438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_mlp": 1.10672879, + "diversity_loss_mlp": 0.0, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.0700140113394834, + "language_loss": 0.90437436, + "learning_rate": 0.0008413458613104444, + "loss": 0.91557699, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1471, + "time_per_iteration": 2.7219245433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111254, + "balance_loss_mlp": 1.09766376, + "diversity_loss_mlp": 0.0, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.07145574186167022, + "language_loss": 0.83164495, + "learning_rate": 0.0008411181490683129, + "loss": 0.84275752, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1472, + "time_per_iteration": 2.727936029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_mlp": 1.09348917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.0645149730480124, + "language_loss": 0.82377428, + "learning_rate": 0.0008408903043923707, + "loss": 0.83485162, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1473, + "time_per_iteration": 2.9972269535064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_mlp": 1.1004951, + "diversity_loss_mlp": 0.0, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09233547648167305, + "language_loss": 0.81268132, + "learning_rate": 0.0008406623273710754, + "loss": 0.82382679, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1474, + "time_per_iteration": 2.5923123359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_mlp": 1.09263408, + "diversity_loss_mlp": 0.0, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0761903935255829, + "language_loss": 0.8290056, + "learning_rate": 0.0008404342180929351, + "loss": 0.840065, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1475, + "time_per_iteration": 2.664698600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121728, + "balance_loss_mlp": 1.10819817, + "diversity_loss_mlp": 0.0, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.08946081876366527, + "language_loss": 0.81824017, + "learning_rate": 0.00084020597664651, + "loss": 0.82945752, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1476, + "time_per_iteration": 2.7941510677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113829, + "balance_loss_mlp": 1.10019112, + "diversity_loss_mlp": 0.0, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09030679544521746, + "language_loss": 0.83820337, + "learning_rate": 0.0008399776031204111, + "loss": 0.84934169, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1477, + "time_per_iteration": 2.7508158683776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_mlp": 1.08784389, + "diversity_loss_mlp": 0.0, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07642048536310797, + "language_loss": 0.79864645, + "learning_rate": 0.0008397490976033009, + "loss": 0.80966175, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1478, + "time_per_iteration": 2.6500625610351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_mlp": 1.04673624, + "diversity_loss_mlp": 0.0, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.0303646120618472, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933775, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.07373047, + "routerloss_mlp": 0.0, + "step": 1479, + "time_per_iteration": 4.757360935211182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_mlp": 1.08449173, + "diversity_loss_mlp": 0.0, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06570619267025138, + "language_loss": 0.85133117, + "learning_rate": 0.0008392916909509525, + "loss": 0.86231726, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1480, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093081, + "balance_loss_mlp": 1.07888281, + "diversity_loss_mlp": 0.0, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07896332999012158, + "language_loss": 0.8543641, + "learning_rate": 0.0008390627899932954, + "loss": 0.86529493, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1481, + "time_per_iteration": 2.5937705039978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_mlp": 1.08532953, + "diversity_loss_mlp": 0.0, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.08879627929694006, + "language_loss": 0.88894033, + "learning_rate": 0.000838833757399789, + "loss": 0.89994287, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1482, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_mlp": 1.09247661, + "diversity_loss_mlp": 0.0, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08557616325511565, + "language_loss": 0.80760586, + "learning_rate": 0.0008386045932593515, + "loss": 0.81867552, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1483, + "time_per_iteration": 2.6901025772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.09776473, + "diversity_loss_mlp": 0.0, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.0661413109298982, + "language_loss": 0.86017227, + "learning_rate": 0.0008383752976609525, + "loss": 0.87129307, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1484, + "time_per_iteration": 2.9148330688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_mlp": 1.1014719, + "diversity_loss_mlp": 0.0, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06788684976720215, + "language_loss": 0.80004096, + "learning_rate": 0.0008381458706936123, + "loss": 0.81120521, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1485, + "time_per_iteration": 2.681067943572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112387, + "balance_loss_mlp": 1.09728312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06920905175587555, + "language_loss": 0.8725493, + "learning_rate": 0.0008379163124464025, + "loss": 0.88367319, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1486, + "time_per_iteration": 2.7093162536621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.10290396, + "diversity_loss_mlp": 0.0, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.09647963836289664, + "language_loss": 0.77093983, + "learning_rate": 0.0008376866230084452, + "loss": 0.78211844, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1487, + "time_per_iteration": 2.8678433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00910546, + "balance_loss_mlp": 1.59136748, + "diversity_loss_mlp": 0.19592074, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.03660624024989628, + "language_loss": 0.86046171, + "learning_rate": 0.000837456802468914, + "loss": 0.86956716, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01690142, + "step": 1488, + "time_per_iteration": 2.602982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_mlp": 1.08787107, + "diversity_loss_mlp": 0.0, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.0820682475712047, + "language_loss": 0.85374725, + "learning_rate": 0.0008372268509170331, + "loss": 0.86477119, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1489, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099554, + "balance_loss_mlp": 1.08529639, + "diversity_loss_mlp": 0.0, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09305985964981825, + "language_loss": 0.85262501, + "learning_rate": 0.0008369967684420779, + "loss": 0.86362052, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1490, + "time_per_iteration": 2.7102949619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.06912422, + "diversity_loss_mlp": 0.0, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.08804420397834639, + "language_loss": 0.84696782, + "learning_rate": 0.0008367665551333736, + "loss": 0.85779965, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1491, + "time_per_iteration": 2.618272304534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_mlp": 1.07430756, + "diversity_loss_mlp": 0.0, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.07991380194683065, + "language_loss": 0.85525382, + "learning_rate": 0.0008365362110802977, + "loss": 0.86614019, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1492, + "time_per_iteration": 2.851928234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_mlp": 1.08655906, + "diversity_loss_mlp": 0.0, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.0838988471662801, + "language_loss": 0.82620168, + "learning_rate": 0.0008363057363722773, + "loss": 0.83721185, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1493, + "time_per_iteration": 2.853207588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.09245062, + "diversity_loss_mlp": 0.0, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.06826703692619526, + "language_loss": 0.84157109, + "learning_rate": 0.0008360751310987906, + "loss": 0.85263485, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1494, + "time_per_iteration": 2.57387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11695361, + "diversity_loss_mlp": 0.0, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.058749130100992836, + "language_loss": 0.85290074, + "learning_rate": 0.0008358443953493666, + "loss": 0.86420786, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1495, + "time_per_iteration": 2.8883073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164777, + "balance_loss_mlp": 1.15067482, + "diversity_loss_mlp": 0.0, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.08087911977453179, + "language_loss": 0.88221979, + "learning_rate": 0.0008356135292135851, + "loss": 0.89386749, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1496, + "time_per_iteration": 2.5230934619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186431, + "balance_loss_mlp": 1.17226899, + "diversity_loss_mlp": 0.0, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.11116302526442519, + "language_loss": 0.92429602, + "learning_rate": 0.0008353825327810758, + "loss": 0.93616039, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1497, + "time_per_iteration": 2.420966863632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188369, + "balance_loss_mlp": 1.17465985, + "diversity_loss_mlp": 0.0, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.07094257684914687, + "language_loss": 0.8160103, + "learning_rate": 0.00083515140614152, + "loss": 0.82789397, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1498, + "time_per_iteration": 2.7105205059051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172297, + "balance_loss_mlp": 1.15901685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.09212284213685974, + "language_loss": 0.87059236, + "learning_rate": 0.0008349201493846485, + "loss": 0.88231528, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1499, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.13470435, + "diversity_loss_mlp": 0.0, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07375807574735407, + "language_loss": 0.88790113, + "learning_rate": 0.0008346887626002432, + "loss": 0.89938325, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1500, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919256, + "balance_loss_mlp": 1.60489607, + "diversity_loss_mlp": 0.19980004, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.030907333217789122, + "language_loss": 0.85892522, + "learning_rate": 0.000834457245878137, + "loss": 0.86811781, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0169074, + "step": 1501, + "time_per_iteration": 2.6543540954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112198, + "balance_loss_mlp": 1.10861671, + "diversity_loss_mlp": 0.0, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.09029230185558035, + "language_loss": 0.81450766, + "learning_rate": 0.000834225599308212, + "loss": 0.82572746, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1502, + "time_per_iteration": 3.2493886947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_mlp": 1.11191428, + "diversity_loss_mlp": 0.0, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07343077704271528, + "language_loss": 0.85592055, + "learning_rate": 0.0008339938229804016, + "loss": 0.86717403, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.13458252, + "routerloss_mlp": 0.0, + "step": 1503, + "time_per_iteration": 2.712455987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.08344853, + "diversity_loss_mlp": 0.0, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.040592353184382625, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76525998, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1504, + "time_per_iteration": 4.975377082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_mlp": 1.10320854, + "diversity_loss_mlp": 0.0, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.10665663300821891, + "language_loss": 0.84014988, + "learning_rate": 0.0008335298814111094, + "loss": 0.85132295, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1505, + "time_per_iteration": 2.563352584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119478, + "balance_loss_mlp": 1.10572124, + "diversity_loss_mlp": 0.0, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.07488877863745698, + "language_loss": 0.87982982, + "learning_rate": 0.0008332977163497455, + "loss": 0.89102459, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1506, + "time_per_iteration": 2.799177646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.10419846, + "diversity_loss_mlp": 0.0, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.08855239932012744, + "language_loss": 0.83522987, + "learning_rate": 0.0008330654218907325, + "loss": 0.84640789, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1507, + "time_per_iteration": 2.7311654090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130891, + "balance_loss_mlp": 1.1170032, + "diversity_loss_mlp": 0.0, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06185767339129184, + "language_loss": 0.82011658, + "learning_rate": 0.0008328329981242548, + "loss": 0.83142549, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1508, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148949, + "balance_loss_mlp": 1.13483465, + "diversity_loss_mlp": 0.0, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.0780337340178098, + "language_loss": 0.88045996, + "learning_rate": 0.0008326004451405475, + "loss": 0.89194947, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1509, + "time_per_iteration": 2.7449288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146827, + "balance_loss_mlp": 1.13290334, + "diversity_loss_mlp": 0.0, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07615169765943663, + "language_loss": 0.82328165, + "learning_rate": 0.0008323677630298957, + "loss": 0.83474988, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1510, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911058, + "balance_loss_mlp": 1.59209251, + "diversity_loss_mlp": 0.19929613, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.030084219280472915, + "language_loss": 0.84789264, + "learning_rate": 0.0008321349518826345, + "loss": 0.85700321, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01536426, + "step": 1511, + "time_per_iteration": 2.85006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167449, + "balance_loss_mlp": 1.15337038, + "diversity_loss_mlp": 0.0, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.09547204503407083, + "language_loss": 0.94614309, + "learning_rate": 0.0008319020117891491, + "loss": 0.95781755, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1512, + "time_per_iteration": 2.619699001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150869, + "balance_loss_mlp": 1.13603973, + "diversity_loss_mlp": 0.0, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0903449194731753, + "language_loss": 0.86757064, + "learning_rate": 0.0008316689428398751, + "loss": 0.87907934, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1513, + "time_per_iteration": 2.6975061893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_mlp": 1.10804975, + "diversity_loss_mlp": 0.0, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05700485295001885, + "language_loss": 0.88661957, + "learning_rate": 0.0008314357451252979, + "loss": 0.89784312, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1514, + "time_per_iteration": 2.7759623527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_mlp": 1.08762062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.06876651723291546, + "language_loss": 0.87979865, + "learning_rate": 0.0008312024187359527, + "loss": 0.89081734, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1515, + "time_per_iteration": 2.6594746112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108928, + "balance_loss_mlp": 1.07499838, + "diversity_loss_mlp": 0.0, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.06943657009436902, + "language_loss": 0.87168229, + "learning_rate": 0.000830968963762425, + "loss": 0.88257504, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1516, + "time_per_iteration": 3.0544168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078645, + "balance_loss_mlp": 1.06457818, + "diversity_loss_mlp": 0.0, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.07942748937188983, + "language_loss": 0.84183443, + "learning_rate": 0.0008307353802953497, + "loss": 0.85262084, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1517, + "time_per_iteration": 2.7325901985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.06031072, + "diversity_loss_mlp": 0.0, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.0903207444065502, + "language_loss": 0.86203992, + "learning_rate": 0.0008305016684254125, + "loss": 0.87279052, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1518, + "time_per_iteration": 2.790580987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_mlp": 1.05908012, + "diversity_loss_mlp": 0.0, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07640210633127195, + "language_loss": 0.86818451, + "learning_rate": 0.0008302678282433479, + "loss": 0.87892002, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1519, + "time_per_iteration": 2.594045400619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077986, + "balance_loss_mlp": 1.06394291, + "diversity_loss_mlp": 0.0, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07607218771192015, + "language_loss": 0.84937745, + "learning_rate": 0.0008300338598399411, + "loss": 0.86015737, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1520, + "time_per_iteration": 2.6176183223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897129, + "balance_loss_mlp": 1.56367016, + "diversity_loss_mlp": 0.19839743, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.03454500929264816, + "language_loss": 0.94754219, + "learning_rate": 0.0008297997633060263, + "loss": 0.95651346, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160955, + "step": 1521, + "time_per_iteration": 2.5507402420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_mlp": 1.08445215, + "diversity_loss_mlp": 0.0, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07923859397995789, + "language_loss": 0.84868819, + "learning_rate": 0.0008295655387324883, + "loss": 0.8596729, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1522, + "time_per_iteration": 2.942894458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_mlp": 1.08957708, + "diversity_loss_mlp": 0.0, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.09185291067452052, + "language_loss": 0.84979212, + "learning_rate": 0.0008293311862102609, + "loss": 0.86082506, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1523, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10218382, + "diversity_loss_mlp": 0.0, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07878242279946136, + "language_loss": 0.88546365, + "learning_rate": 0.0008290967058303275, + "loss": 0.89662319, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1524, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_mlp": 1.10387325, + "diversity_loss_mlp": 0.0, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07157234250277994, + "language_loss": 0.86573815, + "learning_rate": 0.0008288620976837219, + "loss": 0.87690842, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1525, + "time_per_iteration": 2.539079427719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116802, + "balance_loss_mlp": 1.10354626, + "diversity_loss_mlp": 0.0, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07300174969402286, + "language_loss": 0.82548958, + "learning_rate": 0.000828627361861527, + "loss": 0.83665758, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1526, + "time_per_iteration": 2.5784413814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_mlp": 1.10368335, + "diversity_loss_mlp": 0.0, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.105387273671708, + "language_loss": 0.84438479, + "learning_rate": 0.0008283924984548752, + "loss": 0.85555708, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1527, + "time_per_iteration": 2.876854181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136626, + "balance_loss_mlp": 1.12352467, + "diversity_loss_mlp": 0.0, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.07473419184062492, + "language_loss": 0.84776825, + "learning_rate": 0.0008281575075549485, + "loss": 0.8591345, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1528, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_mlp": 1.09631968, + "diversity_loss_mlp": 0.0, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.053938657910520806, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78456688, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1529, + "time_per_iteration": 4.633493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149647, + "balance_loss_mlp": 1.13666511, + "diversity_loss_mlp": 0.0, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.07225715112962865, + "language_loss": 0.90511358, + "learning_rate": 0.0008276871436402469, + "loss": 0.91661, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1530, + "time_per_iteration": 2.8149213790893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156897, + "balance_loss_mlp": 1.14402199, + "diversity_loss_mlp": 0.0, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.10076437192912456, + "language_loss": 0.87526608, + "learning_rate": 0.000827451770808083, + "loss": 0.88683504, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1531, + "time_per_iteration": 2.7307019233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_mlp": 1.12402749, + "diversity_loss_mlp": 0.0, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07118672956881426, + "language_loss": 0.8318634, + "learning_rate": 0.0008272162708478674, + "loss": 0.84323561, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1532, + "time_per_iteration": 2.559326648712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_mlp": 1.1222167, + "diversity_loss_mlp": 0.0, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.07324079883183283, + "language_loss": 0.86170006, + "learning_rate": 0.000826980643851029, + "loss": 0.87305093, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1533, + "time_per_iteration": 2.728351354598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120692, + "balance_loss_mlp": 1.10734081, + "diversity_loss_mlp": 0.0, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.07850912920042735, + "language_loss": 0.84523225, + "learning_rate": 0.0008267448899090464, + "loss": 0.85643911, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1534, + "time_per_iteration": 2.595296859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121931, + "balance_loss_mlp": 1.10788798, + "diversity_loss_mlp": 0.0, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07265790711823701, + "language_loss": 0.80930066, + "learning_rate": 0.0008265090091134473, + "loss": 0.82051992, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1535, + "time_per_iteration": 2.8336315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105358, + "balance_loss_mlp": 1.09133863, + "diversity_loss_mlp": 0.0, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.08467148330579209, + "language_loss": 0.80271345, + "learning_rate": 0.0008262730015558088, + "loss": 0.81376696, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1536, + "time_per_iteration": 2.9066760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_mlp": 1.08847594, + "diversity_loss_mlp": 0.0, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.07407642769484, + "language_loss": 0.81805962, + "learning_rate": 0.0008260368673277574, + "loss": 0.82908159, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1537, + "time_per_iteration": 3.1795482635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_mlp": 1.09302735, + "diversity_loss_mlp": 0.0, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06784415515848828, + "language_loss": 0.84026253, + "learning_rate": 0.0008258006065209682, + "loss": 0.85132986, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1538, + "time_per_iteration": 2.766732931137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112658, + "balance_loss_mlp": 1.09863889, + "diversity_loss_mlp": 0.0, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.0747520981493109, + "language_loss": 0.80543184, + "learning_rate": 0.0008255642192271657, + "loss": 0.81655836, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1539, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130833, + "balance_loss_mlp": 1.11683834, + "diversity_loss_mlp": 0.0, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06277821647748005, + "language_loss": 0.83592129, + "learning_rate": 0.0008253277055381241, + "loss": 0.8472296, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1540, + "time_per_iteration": 2.8384311199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138407, + "balance_loss_mlp": 1.12428069, + "diversity_loss_mlp": 0.0, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09924754491110549, + "language_loss": 0.85482454, + "learning_rate": 0.0008250910655456658, + "loss": 0.86620867, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1541, + "time_per_iteration": 3.1718008518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.12016189, + "diversity_loss_mlp": 0.0, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.07747440640117766, + "language_loss": 0.83370835, + "learning_rate": 0.0008248542993416625, + "loss": 0.84504688, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1542, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127147, + "balance_loss_mlp": 1.11278272, + "diversity_loss_mlp": 0.0, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08018137719350796, + "language_loss": 0.83926904, + "learning_rate": 0.0008246174070180352, + "loss": 0.85054052, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1543, + "time_per_iteration": 2.6775217056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.10168624, + "diversity_loss_mlp": 0.0, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09273281815149376, + "language_loss": 0.83928716, + "learning_rate": 0.0008243803886667537, + "loss": 0.85044312, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1544, + "time_per_iteration": 3.0925238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_mlp": 1.09024858, + "diversity_loss_mlp": 0.0, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.06593992881851045, + "language_loss": 0.79115343, + "learning_rate": 0.0008241432443798364, + "loss": 0.80219567, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1545, + "time_per_iteration": 2.839099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_mlp": 1.07518196, + "diversity_loss_mlp": 0.0, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05453506209022983, + "language_loss": 0.85691601, + "learning_rate": 0.0008239059742493512, + "loss": 0.86780155, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1546, + "time_per_iteration": 2.7476751804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088565, + "balance_loss_mlp": 1.07480812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.06672989003234615, + "language_loss": 0.87117672, + "learning_rate": 0.0008236685783674142, + "loss": 0.88206244, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1547, + "time_per_iteration": 3.0519776344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06796312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.04305360715769565, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.772995, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1548, + "time_per_iteration": 4.883166790008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.07123256, + "diversity_loss_mlp": 0.0, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.11160876507978217, + "language_loss": 0.82253683, + "learning_rate": 0.0008231934097178955, + "loss": 0.8333841, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.1350708, + "routerloss_mlp": 0.0, + "step": 1549, + "time_per_iteration": 2.60786771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.07919788, + "diversity_loss_mlp": 0.0, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07843428838445873, + "language_loss": 0.85328496, + "learning_rate": 0.0008229556371347903, + "loss": 0.86420953, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1550, + "time_per_iteration": 2.962412118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106892, + "balance_loss_mlp": 1.09379029, + "diversity_loss_mlp": 0.0, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.0840525031564576, + "language_loss": 0.79399186, + "learning_rate": 0.0008227177391691874, + "loss": 0.80506086, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.13122559, + "routerloss_mlp": 0.0, + "step": 1551, + "time_per_iteration": 3.1673550605773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111848, + "balance_loss_mlp": 1.09871709, + "diversity_loss_mlp": 0.0, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07195743014481873, + "language_loss": 0.89281148, + "learning_rate": 0.0008224797159134463, + "loss": 0.90392995, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1552, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121508, + "balance_loss_mlp": 1.10890126, + "diversity_loss_mlp": 0.0, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07485820549569244, + "language_loss": 0.83144093, + "learning_rate": 0.0008222415674599765, + "loss": 0.84265602, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 1553, + "time_per_iteration": 3.077017068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135128, + "balance_loss_mlp": 1.12165701, + "diversity_loss_mlp": 0.0, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.08671551895934956, + "language_loss": 0.83149582, + "learning_rate": 0.0008220032939012349, + "loss": 0.84284711, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1554, + "time_per_iteration": 2.6689035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115966, + "balance_loss_mlp": 1.10284674, + "diversity_loss_mlp": 0.0, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06666483036401037, + "language_loss": 0.87800217, + "learning_rate": 0.0008217648953297277, + "loss": 0.88916183, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1555, + "time_per_iteration": 2.8417294025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_mlp": 1.10677278, + "diversity_loss_mlp": 0.0, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.08472740856632217, + "language_loss": 0.78017807, + "learning_rate": 0.0008215263718380095, + "loss": 0.7913779, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1556, + "time_per_iteration": 2.682047128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096383, + "balance_loss_mlp": 1.08319807, + "diversity_loss_mlp": 0.0, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07743195715790333, + "language_loss": 0.84389544, + "learning_rate": 0.0008212877235186833, + "loss": 0.85485923, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.13201904, + "routerloss_mlp": 0.0, + "step": 1557, + "time_per_iteration": 2.6532580852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_mlp": 1.06710196, + "diversity_loss_mlp": 0.0, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04061005434024277, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78811955, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 1558, + "time_per_iteration": 4.923272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092088, + "balance_loss_mlp": 1.07896352, + "diversity_loss_mlp": 0.0, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.10565427097675566, + "language_loss": 0.8116585, + "learning_rate": 0.0008208100527678611, + "loss": 0.82257938, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1559, + "time_per_iteration": 2.602773427963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.07101393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11780548804152448, + "language_loss": 0.78494406, + "learning_rate": 0.0008205710305218135, + "loss": 0.79578459, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1560, + "time_per_iteration": 3.013576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089526, + "balance_loss_mlp": 1.07663918, + "diversity_loss_mlp": 0.0, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.08018423106971302, + "language_loss": 0.89838511, + "learning_rate": 0.0008203318838190541, + "loss": 0.9092803, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1561, + "time_per_iteration": 2.741619348526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108989, + "balance_loss_mlp": 1.07702184, + "diversity_loss_mlp": 0.0, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09397123990600864, + "language_loss": 0.85396177, + "learning_rate": 0.0008200926127524281, + "loss": 0.86486065, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1562, + "time_per_iteration": 2.60974383354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_mlp": 1.0936904, + "diversity_loss_mlp": 0.0, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08688269643752358, + "language_loss": 0.83400619, + "learning_rate": 0.0008198532174148289, + "loss": 0.84507322, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1563, + "time_per_iteration": 2.7336533069610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079297, + "balance_loss_mlp": 1.07195389, + "diversity_loss_mlp": 0.0, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.04112604139988501, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81765467, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1564, + "time_per_iteration": 4.828714609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145426, + "balance_loss_mlp": 1.1324501, + "diversity_loss_mlp": 0.0, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08852118135813189, + "language_loss": 0.89291, + "learning_rate": 0.0008193740542985244, + "loss": 0.90436429, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1565, + "time_per_iteration": 2.5988731384277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151488, + "balance_loss_mlp": 1.13872099, + "diversity_loss_mlp": 0.0, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1281977179548432, + "language_loss": 0.86354733, + "learning_rate": 0.0008191342867058467, + "loss": 0.87506223, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1566, + "time_per_iteration": 2.6914639472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.10574174, + "diversity_loss_mlp": 0.0, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.07018370282969584, + "language_loss": 0.83602738, + "learning_rate": 0.0008188943952142509, + "loss": 0.84721458, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1567, + "time_per_iteration": 2.7846438884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111402, + "balance_loss_mlp": 1.09847367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.08750889372003143, + "language_loss": 0.82150149, + "learning_rate": 0.0008186543799168711, + "loss": 0.83261549, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1568, + "time_per_iteration": 3.1300384998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094152, + "balance_loss_mlp": 1.08103871, + "diversity_loss_mlp": 0.0, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.07719475001811499, + "language_loss": 0.88627326, + "learning_rate": 0.0008184142409068892, + "loss": 0.89721477, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.13134766, + "routerloss_mlp": 0.0, + "step": 1569, + "time_per_iteration": 2.9922726154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_mlp": 1.07475495, + "diversity_loss_mlp": 0.0, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.07345065764158631, + "language_loss": 0.86446834, + "learning_rate": 0.000818173978277536, + "loss": 0.87534571, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.12994385, + "routerloss_mlp": 0.0, + "step": 1570, + "time_per_iteration": 2.695930242538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089564, + "balance_loss_mlp": 1.07673669, + "diversity_loss_mlp": 0.0, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.0712021049255776, + "language_loss": 0.83337176, + "learning_rate": 0.000817933592122089, + "loss": 0.84426749, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.12841797, + "routerloss_mlp": 0.0, + "step": 1571, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.07427394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.08283074842036095, + "language_loss": 0.83667982, + "learning_rate": 0.0008176930825338749, + "loss": 0.84755468, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1572, + "time_per_iteration": 2.5447826385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_mlp": 1.07405734, + "diversity_loss_mlp": 0.0, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07741282152017008, + "language_loss": 0.88849854, + "learning_rate": 0.0008174524496062679, + "loss": 0.89937723, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1573, + "time_per_iteration": 2.908740997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_mlp": 1.07822633, + "diversity_loss_mlp": 0.0, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.06962859876416791, + "language_loss": 0.85499102, + "learning_rate": 0.0008172116934326894, + "loss": 0.86591208, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1574, + "time_per_iteration": 2.751488208770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_mlp": 1.08365786, + "diversity_loss_mlp": 0.0, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.09195920466248479, + "language_loss": 0.8794626, + "learning_rate": 0.0008169708141066097, + "loss": 0.89044309, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1575, + "time_per_iteration": 2.5947275161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118908, + "balance_loss_mlp": 1.10441208, + "diversity_loss_mlp": 0.0, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.0784824693742563, + "language_loss": 0.90658617, + "learning_rate": 0.0008167298117215465, + "loss": 0.91777527, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1576, + "time_per_iteration": 2.5396125316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011316, + "balance_loss_mlp": 1.11705649, + "diversity_loss_mlp": 0.0, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.1093253517132677, + "language_loss": 0.87566864, + "learning_rate": 0.0008164886863710649, + "loss": 0.88698471, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1577, + "time_per_iteration": 2.931835412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138004, + "balance_loss_mlp": 1.12323439, + "diversity_loss_mlp": 0.0, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07788016425512684, + "language_loss": 0.8637675, + "learning_rate": 0.0008162474381487783, + "loss": 0.87514758, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1578, + "time_per_iteration": 3.041262626647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125978, + "balance_loss_mlp": 1.11132693, + "diversity_loss_mlp": 0.0, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.1532642042193693, + "language_loss": 0.84568751, + "learning_rate": 0.0008160060671483475, + "loss": 0.8569473, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1579, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_mlp": 1.0942831, + "diversity_loss_mlp": 0.0, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.10001869607158981, + "language_loss": 0.8342396, + "learning_rate": 0.0008157645734634809, + "loss": 0.84532249, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1580, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151521, + "balance_loss_mlp": 1.14064956, + "diversity_loss_mlp": 0.0, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.06737085519591758, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78048015, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 1581, + "time_per_iteration": 4.946556329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00631723, + "balance_loss_mlp": 1.05820811, + "diversity_loss_mlp": 0.17941347, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.002006006723137456, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.73846221, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01291206, + "step": 1582, + "time_per_iteration": 4.897693395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.08376384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.07529557219412701, + "language_loss": 0.83949858, + "learning_rate": 0.000815039357240067, + "loss": 0.85047406, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1583, + "time_per_iteration": 2.6096932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101837, + "balance_loss_mlp": 1.0882473, + "diversity_loss_mlp": 0.0, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.0740498467066553, + "language_loss": 0.84922493, + "learning_rate": 0.0008147973737554952, + "loss": 0.86024332, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1584, + "time_per_iteration": 2.7863824367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_mlp": 1.09364963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.11669723774220289, + "language_loss": 0.85926318, + "learning_rate": 0.000814555268055744, + "loss": 0.87033093, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1585, + "time_per_iteration": 2.6167564392089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_mlp": 1.1022768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07476018488685929, + "language_loss": 0.87489879, + "learning_rate": 0.0008143130402348073, + "loss": 0.88605773, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1586, + "time_per_iteration": 2.6318202018737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112097, + "balance_loss_mlp": 1.10742807, + "diversity_loss_mlp": 0.0, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.07016471467090964, + "language_loss": 0.79198885, + "learning_rate": 0.0008140706903867265, + "loss": 0.80319858, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1587, + "time_per_iteration": 2.82663893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128991, + "balance_loss_mlp": 1.11541307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09040046070353, + "language_loss": 0.90612531, + "learning_rate": 0.0008138282186055897, + "loss": 0.91741514, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1588, + "time_per_iteration": 2.690561294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_mlp": 1.12872136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.07675542780120453, + "language_loss": 0.82382154, + "learning_rate": 0.0008135856249855331, + "loss": 0.83524311, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1589, + "time_per_iteration": 2.6935813426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115907, + "balance_loss_mlp": 1.14551568, + "diversity_loss_mlp": 0.0, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.07642745969896261, + "language_loss": 0.89603746, + "learning_rate": 0.0008133429096207398, + "loss": 0.90762818, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1590, + "time_per_iteration": 2.7690787315368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_mlp": 1.10534787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.03962763613217991, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76425815, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.08203125, + "routerloss_mlp": 0.0, + "step": 1591, + "time_per_iteration": 4.950432538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_mlp": 1.17060041, + "diversity_loss_mlp": 0.0, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.0624915030883944, + "language_loss": 0.8671608, + "learning_rate": 0.0008128571140339123, + "loss": 0.87900144, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1592, + "time_per_iteration": 2.717022657394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.15618944, + "diversity_loss_mlp": 0.0, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.08640912687422367, + "language_loss": 0.87240267, + "learning_rate": 0.0008126140340004805, + "loss": 0.88410139, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1593, + "time_per_iteration": 2.5112054347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.14379096, + "diversity_loss_mlp": 0.0, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06492228459438584, + "language_loss": 0.82168889, + "learning_rate": 0.0008123708325995172, + "loss": 0.83326268, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1594, + "time_per_iteration": 3.193125009536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139509, + "balance_loss_mlp": 1.1256932, + "diversity_loss_mlp": 0.0, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06515151231920442, + "language_loss": 0.79815221, + "learning_rate": 0.0008121275099254414, + "loss": 0.80954736, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1595, + "time_per_iteration": 2.9032304286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133663, + "balance_loss_mlp": 1.12007284, + "diversity_loss_mlp": 0.0, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06899315915000012, + "language_loss": 0.88638222, + "learning_rate": 0.0008118840660727194, + "loss": 0.89771879, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1596, + "time_per_iteration": 2.6298515796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115466, + "balance_loss_mlp": 1.10215056, + "diversity_loss_mlp": 0.0, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06984166924665287, + "language_loss": 0.87847084, + "learning_rate": 0.0008116405011358644, + "loss": 0.88962543, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.13336182, + "routerloss_mlp": 0.0, + "step": 1597, + "time_per_iteration": 3.1922342777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.08212388, + "diversity_loss_mlp": 0.0, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.07145022695402857, + "language_loss": 0.79985273, + "learning_rate": 0.0008113968152094369, + "loss": 0.81081259, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1598, + "time_per_iteration": 2.500500440597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_mlp": 1.07637632, + "diversity_loss_mlp": 0.0, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.07896733537507578, + "language_loss": 0.82477671, + "learning_rate": 0.0008111530083880438, + "loss": 0.83567768, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1599, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.07693791, + "diversity_loss_mlp": 0.0, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.10700735308097704, + "language_loss": 0.86289096, + "learning_rate": 0.0008109090807663399, + "loss": 0.87379909, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1600, + "time_per_iteration": 2.7883458137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_mlp": 1.07049167, + "diversity_loss_mlp": 0.0, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.058046583591585654, + "language_loss": 0.8845669, + "learning_rate": 0.0008106650324390257, + "loss": 0.89541531, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1601, + "time_per_iteration": 2.8250818252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012306, + "balance_loss_mlp": 1.78856134, + "diversity_loss_mlp": 0.20302816, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.03151963489439222, + "language_loss": 0.81347358, + "learning_rate": 0.0008104208635008493, + "loss": 0.8235966, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165114, + "step": 1602, + "time_per_iteration": 2.6824991703033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078191, + "balance_loss_mlp": 1.06365991, + "diversity_loss_mlp": 0.0, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.06925842581040223, + "language_loss": 0.81696957, + "learning_rate": 0.0008101765740466058, + "loss": 0.82775152, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1603, + "time_per_iteration": 2.4828884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083153, + "balance_loss_mlp": 1.06891942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.08194523431430376, + "language_loss": 0.83996522, + "learning_rate": 0.0008099321641711364, + "loss": 0.85079676, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1604, + "time_per_iteration": 2.628990650177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093302, + "balance_loss_mlp": 1.07891393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.066381842407901, + "language_loss": 0.83568424, + "learning_rate": 0.0008096876339693295, + "loss": 0.84661728, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1605, + "time_per_iteration": 2.621486186981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_mlp": 1.0898906, + "diversity_loss_mlp": 0.0, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08065648415588843, + "language_loss": 0.8146233, + "learning_rate": 0.0008094429835361206, + "loss": 0.82566357, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1606, + "time_per_iteration": 2.9436137676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_mlp": 1.08727765, + "diversity_loss_mlp": 0.0, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.06722603246449312, + "language_loss": 0.85730284, + "learning_rate": 0.0008091982129664908, + "loss": 0.86832106, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1607, + "time_per_iteration": 2.6776270866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_mlp": 1.09606481, + "diversity_loss_mlp": 0.0, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07435522574008574, + "language_loss": 0.83177197, + "learning_rate": 0.0008089533223554687, + "loss": 0.842875, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1608, + "time_per_iteration": 2.6971724033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106883, + "balance_loss_mlp": 1.09322155, + "diversity_loss_mlp": 0.0, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08534881839400792, + "language_loss": 0.85436511, + "learning_rate": 0.0008087083117981294, + "loss": 0.86543399, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1609, + "time_per_iteration": 2.873072624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_mlp": 1.08715367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.08408730625442483, + "language_loss": 0.88209295, + "learning_rate": 0.0008084631813895943, + "loss": 0.89310181, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1610, + "time_per_iteration": 2.7717368602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_mlp": 1.0843389, + "diversity_loss_mlp": 0.0, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07291880748627809, + "language_loss": 0.84093356, + "learning_rate": 0.0008082179312250315, + "loss": 0.85191453, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1611, + "time_per_iteration": 2.6323728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167376, + "balance_loss_mlp": 1.15912676, + "diversity_loss_mlp": 0.0, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.06715325583723679, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81023216, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 1612, + "time_per_iteration": 4.837978839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_mlp": 1.09591889, + "diversity_loss_mlp": 0.0, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.04843806861709949, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77733123, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.07861328, + "routerloss_mlp": 0.0, + "step": 1613, + "time_per_iteration": 5.086154937744141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_mlp": 1.10497594, + "diversity_loss_mlp": 0.0, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.09649046421891638, + "language_loss": 0.82414234, + "learning_rate": 0.0008074814631475545, + "loss": 0.83532858, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1614, + "time_per_iteration": 3.3300058841705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115901, + "balance_loss_mlp": 1.10232294, + "diversity_loss_mlp": 0.0, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.10381126956618623, + "language_loss": 0.7917223, + "learning_rate": 0.0008072357349114907, + "loss": 0.80288124, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1615, + "time_per_iteration": 2.692242383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.1100384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.09811598085954727, + "language_loss": 0.88751173, + "learning_rate": 0.0008069898873959363, + "loss": 0.89874619, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1616, + "time_per_iteration": 2.688138723373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119599, + "balance_loss_mlp": 1.10590243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.06496922585492992, + "language_loss": 0.85670269, + "learning_rate": 0.0008067439206963375, + "loss": 0.8678987, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1617, + "time_per_iteration": 2.628465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126727, + "balance_loss_mlp": 1.11359048, + "diversity_loss_mlp": 0.0, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08367367493581554, + "language_loss": 0.86233091, + "learning_rate": 0.0008064978349081873, + "loss": 0.87359822, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1618, + "time_per_iteration": 2.9359195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_mlp": 1.10941529, + "diversity_loss_mlp": 0.0, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.062058920213391884, + "language_loss": 0.86742592, + "learning_rate": 0.0008062516301270245, + "loss": 0.87865382, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1619, + "time_per_iteration": 2.685615301132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968061, + "balance_loss_mlp": 1.70987701, + "diversity_loss_mlp": 0.19448289, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.02692656797073588, + "language_loss": 0.8831743, + "learning_rate": 0.0008060053064484343, + "loss": 0.89285493, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588114, + "step": 1620, + "time_per_iteration": 2.9507076740264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131577, + "balance_loss_mlp": 1.11839283, + "diversity_loss_mlp": 0.0, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.08216719715750098, + "language_loss": 0.85142976, + "learning_rate": 0.0008057588639680482, + "loss": 0.86274558, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.13208008, + "routerloss_mlp": 0.0, + "step": 1621, + "time_per_iteration": 2.7498936653137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00955916, + "balance_loss_mlp": 1.68915153, + "diversity_loss_mlp": 0.19115068, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.038673577194741904, + "language_loss": 0.82934028, + "learning_rate": 0.0008055123027815434, + "loss": 0.83889943, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01576493, + "step": 1622, + "time_per_iteration": 2.92877459526062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10545552, + "diversity_loss_mlp": 0.0, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.11144773799130939, + "language_loss": 0.8492527, + "learning_rate": 0.0008052656229846436, + "loss": 0.86044282, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.13580322, + "routerloss_mlp": 0.0, + "step": 1623, + "time_per_iteration": 2.6647849082946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_mlp": 1.09039474, + "diversity_loss_mlp": 0.0, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.09067734621983937, + "language_loss": 0.90320027, + "learning_rate": 0.0008050188246731182, + "loss": 0.9142437, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1624, + "time_per_iteration": 2.6908931732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_mlp": 1.07360816, + "diversity_loss_mlp": 0.0, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08706559573327896, + "language_loss": 0.8222695, + "learning_rate": 0.0008047719079427834, + "loss": 0.83314216, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1625, + "time_per_iteration": 2.979578733444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281481, + "balance_loss_mlp": 1.27170551, + "diversity_loss_mlp": 0.0, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.09241126848133228, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75633186, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.09765625, + "routerloss_mlp": 0.0, + "step": 1626, + "time_per_iteration": 4.813723802566528 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_mlp": 1.06489933, + "diversity_loss_mlp": 0.0, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.061158387019755324, + "language_loss": 0.86164916, + "learning_rate": 0.0008042777196091757, + "loss": 0.87243509, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1627, + "time_per_iteration": 2.6777052879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931263, + "balance_loss_mlp": 1.63595629, + "diversity_loss_mlp": 0.19502082, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.02888255305303151, + "language_loss": 0.81839561, + "learning_rate": 0.0008040304481977643, + "loss": 0.82770824, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01577434, + "step": 1628, + "time_per_iteration": 2.685519218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.07024312, + "diversity_loss_mlp": 0.0, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.070875243316129, + "language_loss": 0.86462033, + "learning_rate": 0.0008037830587512649, + "loss": 0.875458, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1629, + "time_per_iteration": 3.0812296867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_mlp": 1.07976675, + "diversity_loss_mlp": 0.0, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.07857424850498267, + "language_loss": 0.78910959, + "learning_rate": 0.0008035355513657224, + "loss": 0.80004621, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1630, + "time_per_iteration": 2.509866714477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109932, + "balance_loss_mlp": 1.08518136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.05926482463995905, + "language_loss": 0.9323386, + "learning_rate": 0.0008032879261372279, + "loss": 0.94333184, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1631, + "time_per_iteration": 2.793675422668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121244, + "balance_loss_mlp": 1.20142555, + "diversity_loss_mlp": 0.0, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.0543299042148954, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80848283, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 1632, + "time_per_iteration": 5.6717705726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08712876, + "diversity_loss_mlp": 0.0, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.07399367926820971, + "language_loss": 0.87236691, + "learning_rate": 0.0008027923225359748, + "loss": 0.88337696, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.13885498, + "routerloss_mlp": 0.0, + "step": 1633, + "time_per_iteration": 2.591161012649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09272563, + "diversity_loss_mlp": 0.0, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07361205381971474, + "language_loss": 0.8823992, + "learning_rate": 0.0008025443443556267, + "loss": 0.89347273, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1634, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.09279966, + "diversity_loss_mlp": 0.0, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.05821338652647348, + "language_loss": 0.88174599, + "learning_rate": 0.000802296248717147, + "loss": 0.89281231, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1635, + "time_per_iteration": 2.924661159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_mlp": 1.08889091, + "diversity_loss_mlp": 0.0, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06918051977022115, + "language_loss": 0.78766519, + "learning_rate": 0.0008020480357168554, + "loss": 0.79869324, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1636, + "time_per_iteration": 2.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_mlp": 1.08334041, + "diversity_loss_mlp": 0.0, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.061070409346790804, + "language_loss": 0.88343245, + "learning_rate": 0.0008017997054511165, + "loss": 0.89440191, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.13623047, + "routerloss_mlp": 0.0, + "step": 1637, + "time_per_iteration": 2.5770463943481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109567, + "balance_loss_mlp": 1.08241367, + "diversity_loss_mlp": 0.0, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06082888573267997, + "language_loss": 0.85688329, + "learning_rate": 0.0008015512580163407, + "loss": 0.86783999, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1638, + "time_per_iteration": 2.7893900871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915347, + "balance_loss_mlp": 1.6005652, + "diversity_loss_mlp": 0.19760543, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.03200753828687725, + "language_loss": 0.80247211, + "learning_rate": 0.0008013026935089838, + "loss": 0.8116256, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0162621, + "step": 1639, + "time_per_iteration": 2.9013028144836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116887, + "balance_loss_mlp": 1.10366678, + "diversity_loss_mlp": 0.0, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.07107229367788748, + "language_loss": 0.84156835, + "learning_rate": 0.0008010540120255472, + "loss": 0.85273731, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1640, + "time_per_iteration": 2.6617894172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122905, + "balance_loss_mlp": 1.10991144, + "diversity_loss_mlp": 0.0, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.08316081918757003, + "language_loss": 0.86058956, + "learning_rate": 0.0008008052136625774, + "loss": 0.87181866, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1641, + "time_per_iteration": 2.8128581047058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.10461712, + "diversity_loss_mlp": 0.0, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.11340060957388516, + "language_loss": 0.86898887, + "learning_rate": 0.0008005562985166666, + "loss": 0.88016647, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1642, + "time_per_iteration": 2.6915791034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113412, + "balance_loss_mlp": 1.10045385, + "diversity_loss_mlp": 0.0, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.06371803301806024, + "language_loss": 0.85065734, + "learning_rate": 0.0008003072666844524, + "loss": 0.86179143, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.12976074, + "routerloss_mlp": 0.0, + "step": 1643, + "time_per_iteration": 2.713515520095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_mlp": 1.09287417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.09207812275617455, + "language_loss": 0.82446098, + "learning_rate": 0.0008000581182626173, + "loss": 0.83551639, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 1644, + "time_per_iteration": 2.5728507041931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099382, + "balance_loss_mlp": 1.08668065, + "diversity_loss_mlp": 0.0, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.07446065392993936, + "language_loss": 0.86341298, + "learning_rate": 0.0007998088533478894, + "loss": 0.87440687, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 1645, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_mlp": 1.09096265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.09512310951915111, + "language_loss": 0.84171218, + "learning_rate": 0.000799559472037042, + "loss": 0.85274899, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 1646, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089286, + "balance_loss_mlp": 1.07678151, + "diversity_loss_mlp": 0.0, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05690135295492242, + "language_loss": 0.87462902, + "learning_rate": 0.0007993099744268932, + "loss": 0.88552189, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1647, + "time_per_iteration": 2.9204719066619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_mlp": 1.08491409, + "diversity_loss_mlp": 0.0, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08028992569563033, + "language_loss": 0.88103539, + "learning_rate": 0.000799060360614307, + "loss": 0.8920151, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1648, + "time_per_iteration": 2.7098584175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094994, + "balance_loss_mlp": 1.08204746, + "diversity_loss_mlp": 0.0, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07374581447427947, + "language_loss": 0.83565277, + "learning_rate": 0.0007988106306961917, + "loss": 0.84660268, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1649, + "time_per_iteration": 3.136148691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096074, + "balance_loss_mlp": 1.08292556, + "diversity_loss_mlp": 0.0, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.08307651310008923, + "language_loss": 0.84510154, + "learning_rate": 0.0007985607847695014, + "loss": 0.85606229, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1650, + "time_per_iteration": 2.6657865047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090136, + "balance_loss_mlp": 1.07697558, + "diversity_loss_mlp": 0.0, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.07221907468491222, + "language_loss": 0.82981718, + "learning_rate": 0.0007983108229312345, + "loss": 0.84071863, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.13183594, + "routerloss_mlp": 0.0, + "step": 1651, + "time_per_iteration": 2.939943313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_mlp": 1.07648206, + "diversity_loss_mlp": 0.0, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.0785368607999539, + "language_loss": 0.86505926, + "learning_rate": 0.0007980607452784351, + "loss": 0.87595987, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1652, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.06952596, + "diversity_loss_mlp": 0.0, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.06920593361186494, + "language_loss": 0.90510356, + "learning_rate": 0.0007978105519081919, + "loss": 0.91593033, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1653, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084984, + "balance_loss_mlp": 1.0715965, + "diversity_loss_mlp": 0.0, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.07269169213621761, + "language_loss": 0.87967515, + "learning_rate": 0.0007975602429176385, + "loss": 0.89052504, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.13415527, + "routerloss_mlp": 0.0, + "step": 1654, + "time_per_iteration": 2.5818393230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_mlp": 1.07225442, + "diversity_loss_mlp": 0.0, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08150423110047789, + "language_loss": 0.81308222, + "learning_rate": 0.0007973098184039536, + "loss": 0.82394195, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1655, + "time_per_iteration": 2.664916515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094509, + "balance_loss_mlp": 1.08110952, + "diversity_loss_mlp": 0.0, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.0661968945841423, + "language_loss": 0.8695243, + "learning_rate": 0.0007970592784643602, + "loss": 0.88046944, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.13427734, + "routerloss_mlp": 0.0, + "step": 1656, + "time_per_iteration": 2.851214647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_mlp": 1.09084868, + "diversity_loss_mlp": 0.0, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.0809768283097012, + "language_loss": 0.85228848, + "learning_rate": 0.0007968086231961272, + "loss": 0.86333275, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1657, + "time_per_iteration": 2.6277201175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_mlp": 1.09744644, + "diversity_loss_mlp": 0.0, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.10999441213252201, + "language_loss": 0.83322126, + "learning_rate": 0.0007965578526965671, + "loss": 0.84433806, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1658, + "time_per_iteration": 2.5514447689056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097556, + "balance_loss_mlp": 1.08337009, + "diversity_loss_mlp": 0.0, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07090711515760839, + "language_loss": 0.86299932, + "learning_rate": 0.0007963069670630377, + "loss": 0.87397492, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1659, + "time_per_iteration": 2.722572088241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.07523549, + "diversity_loss_mlp": 0.0, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07181055202596492, + "language_loss": 0.88127738, + "learning_rate": 0.0007960559663929416, + "loss": 0.8921715, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1660, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_mlp": 1.06500006, + "diversity_loss_mlp": 0.0, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.06614466369263741, + "language_loss": 0.87915826, + "learning_rate": 0.0007958048507837259, + "loss": 0.88995141, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1661, + "time_per_iteration": 2.954888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06107187, + "diversity_loss_mlp": 0.0, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08599761261652404, + "language_loss": 0.87309289, + "learning_rate": 0.0007955536203328822, + "loss": 0.88384914, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1662, + "time_per_iteration": 2.9499282836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074811, + "balance_loss_mlp": 1.06073272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.08962386225204486, + "language_loss": 0.8334958, + "learning_rate": 0.0007953022751379469, + "loss": 0.84424388, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1663, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075722, + "balance_loss_mlp": 1.06131005, + "diversity_loss_mlp": 0.0, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.08182948291647181, + "language_loss": 0.8200748, + "learning_rate": 0.000795050815296501, + "loss": 0.830832, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1664, + "time_per_iteration": 2.9893014430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.07167196, + "diversity_loss_mlp": 0.0, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.0641722272838546, + "language_loss": 0.93037909, + "learning_rate": 0.0007947992409061695, + "loss": 0.94122881, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1665, + "time_per_iteration": 2.583789110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_mlp": 1.08662808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07388769827525307, + "language_loss": 0.86501724, + "learning_rate": 0.0007945475520646226, + "loss": 0.87601787, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1666, + "time_per_iteration": 2.944988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127031, + "balance_loss_mlp": 1.11408508, + "diversity_loss_mlp": 0.0, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.0781321549049884, + "language_loss": 0.84777099, + "learning_rate": 0.0007942957488695743, + "loss": 0.85904133, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1667, + "time_per_iteration": 2.667464017868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.12505507, + "diversity_loss_mlp": 0.0, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06588913292879497, + "language_loss": 0.81000018, + "learning_rate": 0.0007940438314187833, + "loss": 0.82138324, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.13250732, + "routerloss_mlp": 0.0, + "step": 1668, + "time_per_iteration": 3.0395359992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147791, + "balance_loss_mlp": 1.13491094, + "diversity_loss_mlp": 0.0, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.07621602089938284, + "language_loss": 0.80540276, + "learning_rate": 0.0007937917998100529, + "loss": 0.8168807, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1669, + "time_per_iteration": 2.5894687175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_mlp": 1.1294744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07981389159152626, + "language_loss": 0.79167509, + "learning_rate": 0.0007935396541412302, + "loss": 0.80310035, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.13067627, + "routerloss_mlp": 0.0, + "step": 1670, + "time_per_iteration": 2.672978401184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141245, + "balance_loss_mlp": 1.12813175, + "diversity_loss_mlp": 0.0, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.06899314705075654, + "language_loss": 0.85712755, + "learning_rate": 0.0007932873945102068, + "loss": 0.86854005, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1671, + "time_per_iteration": 2.6296515464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_mlp": 1.25616145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.05047573422440889, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.77033865, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1672, + "time_per_iteration": 4.840561628341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138556, + "balance_loss_mlp": 1.1251744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.06902528499394482, + "language_loss": 0.86527705, + "learning_rate": 0.0007927825337533461, + "loss": 0.87666261, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1673, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142697, + "balance_loss_mlp": 1.12930942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.08521571565711833, + "language_loss": 0.84877092, + "learning_rate": 0.0007925299328235131, + "loss": 0.8601979, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1674, + "time_per_iteration": 2.659621238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141943, + "balance_loss_mlp": 1.12855613, + "diversity_loss_mlp": 0.0, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.08187135533898351, + "language_loss": 0.84720862, + "learning_rate": 0.000792277218323488, + "loss": 0.85862803, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1675, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.12169456, + "diversity_loss_mlp": 0.0, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.08499328402904442, + "language_loss": 0.8509531, + "learning_rate": 0.0007920243903513833, + "loss": 0.86230332, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1676, + "time_per_iteration": 2.5730555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126699, + "balance_loss_mlp": 1.11364567, + "diversity_loss_mlp": 0.0, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08854342537284099, + "language_loss": 0.84008271, + "learning_rate": 0.0007917714490053556, + "loss": 0.85134971, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1677, + "time_per_iteration": 2.718555212020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_mlp": 1.10974979, + "diversity_loss_mlp": 0.0, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.07711595043056121, + "language_loss": 0.86223996, + "learning_rate": 0.0007915183943836055, + "loss": 0.87346947, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1678, + "time_per_iteration": 2.902038812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_mlp": 1.09958673, + "diversity_loss_mlp": 0.0, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07762427611918464, + "language_loss": 0.8422336, + "learning_rate": 0.0007912652265843773, + "loss": 0.85335761, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1679, + "time_per_iteration": 3.024665117263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107958, + "balance_loss_mlp": 1.09453535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06959311244041297, + "language_loss": 0.81845474, + "learning_rate": 0.0007910119457059597, + "loss": 0.82953429, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1680, + "time_per_iteration": 2.6954221725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111109, + "balance_loss_mlp": 1.09806776, + "diversity_loss_mlp": 0.0, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08135634404485692, + "language_loss": 0.80380678, + "learning_rate": 0.0007907585518466849, + "loss": 0.81491786, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1681, + "time_per_iteration": 2.961648464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_mlp": 1.09574652, + "diversity_loss_mlp": 0.0, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.06462126830885603, + "language_loss": 0.89670283, + "learning_rate": 0.000790505045104929, + "loss": 0.90779042, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1682, + "time_per_iteration": 2.5210485458374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_mlp": 1.09719789, + "diversity_loss_mlp": 0.0, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.08715930327910015, + "language_loss": 0.86719161, + "learning_rate": 0.0007902514255791125, + "loss": 0.8782934, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1683, + "time_per_iteration": 2.8002610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097901, + "balance_loss_mlp": 1.084764, + "diversity_loss_mlp": 0.0, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06656486310868524, + "language_loss": 0.8795855, + "learning_rate": 0.0007899976933676986, + "loss": 0.89056444, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1684, + "time_per_iteration": 2.967172622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092249, + "balance_loss_mlp": 1.07880259, + "diversity_loss_mlp": 0.0, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09628316614228749, + "language_loss": 0.87045735, + "learning_rate": 0.0007897438485691955, + "loss": 0.88137984, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1685, + "time_per_iteration": 2.680147171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103099, + "balance_loss_mlp": 1.0898304, + "diversity_loss_mlp": 0.0, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0850736326825917, + "language_loss": 0.82684374, + "learning_rate": 0.0007894898912821542, + "loss": 0.83787471, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1686, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.0880518, + "diversity_loss_mlp": 0.0, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06056792299191916, + "language_loss": 0.86695451, + "learning_rate": 0.0007892358216051695, + "loss": 0.87797034, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1687, + "time_per_iteration": 2.7851648330688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.09641767, + "diversity_loss_mlp": 0.0, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.07434076211008771, + "language_loss": 0.91829026, + "learning_rate": 0.0007889816396368803, + "loss": 0.92938912, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1688, + "time_per_iteration": 2.6211581230163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.10499799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07845440141588131, + "language_loss": 0.85253429, + "learning_rate": 0.0007887273454759687, + "loss": 0.8637172, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.13299561, + "routerloss_mlp": 0.0, + "step": 1689, + "time_per_iteration": 2.507779598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.10946417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.08373410695529686, + "language_loss": 0.82792354, + "learning_rate": 0.0007884729392211603, + "loss": 0.83914578, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1690, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119249, + "balance_loss_mlp": 1.10672641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09069843341009556, + "language_loss": 0.85648167, + "learning_rate": 0.0007882184209712245, + "loss": 0.86767411, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.12530518, + "routerloss_mlp": 0.0, + "step": 1691, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00949982, + "balance_loss_mlp": 1.66309059, + "diversity_loss_mlp": 0.20491584, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.028395749586794427, + "language_loss": 0.85757548, + "learning_rate": 0.000787963790824974, + "loss": 0.86707526, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597837, + "step": 1692, + "time_per_iteration": 3.009209156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113225, + "balance_loss_mlp": 1.10071397, + "diversity_loss_mlp": 0.0, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.22846677162281695, + "language_loss": 0.89612615, + "learning_rate": 0.0007877090488812651, + "loss": 0.90725839, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.12512207, + "routerloss_mlp": 0.0, + "step": 1693, + "time_per_iteration": 2.450209617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936753, + "balance_loss_mlp": 1.63723278, + "diversity_loss_mlp": 0.20419246, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.03161007726798549, + "language_loss": 0.83743423, + "learning_rate": 0.0007874541952389973, + "loss": 0.84680176, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01604037, + "step": 1694, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111615, + "balance_loss_mlp": 1.10350823, + "diversity_loss_mlp": 0.0, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.07424213060006848, + "language_loss": 0.86538494, + "learning_rate": 0.0007871992299971136, + "loss": 0.87654638, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1695, + "time_per_iteration": 2.570406913757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_mlp": 1.11953878, + "diversity_loss_mlp": 0.0, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.0612219868328418, + "language_loss": 0.84142137, + "learning_rate": 0.0007869441532546001, + "loss": 0.852741, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1696, + "time_per_iteration": 2.763688087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_mlp": 1.11626601, + "diversity_loss_mlp": 0.0, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06155756648422996, + "language_loss": 0.79298395, + "learning_rate": 0.0007866889651104867, + "loss": 0.80426925, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 1697, + "time_per_iteration": 2.816236972808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_mlp": 1.11769366, + "diversity_loss_mlp": 0.0, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0827611554210385, + "language_loss": 0.83172429, + "learning_rate": 0.000786433665663846, + "loss": 0.84303296, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.13195801, + "routerloss_mlp": 0.0, + "step": 1698, + "time_per_iteration": 2.6627049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135283, + "balance_loss_mlp": 1.12240815, + "diversity_loss_mlp": 0.0, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.08562611300573084, + "language_loss": 0.86256903, + "learning_rate": 0.0007861782550137942, + "loss": 0.87392187, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1699, + "time_per_iteration": 2.9298973083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115677, + "balance_loss_mlp": 1.10270739, + "diversity_loss_mlp": 0.0, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.06870341741306431, + "language_loss": 0.85913056, + "learning_rate": 0.0007859227332594901, + "loss": 0.8702873, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1700, + "time_per_iteration": 2.9108214378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_mlp": 1.08703494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.08010897822069696, + "language_loss": 0.84705722, + "learning_rate": 0.0007856671005001365, + "loss": 0.85805643, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1701, + "time_per_iteration": 3.172921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_mlp": 1.07506084, + "diversity_loss_mlp": 0.0, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.0963591610521261, + "language_loss": 0.81720912, + "learning_rate": 0.0007854113568349787, + "loss": 0.82809043, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.13085938, + "routerloss_mlp": 0.0, + "step": 1702, + "time_per_iteration": 3.1135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_mlp": 1.08686948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07838750037803571, + "language_loss": 0.80661154, + "learning_rate": 0.0007851555023633052, + "loss": 0.8176142, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.13397217, + "routerloss_mlp": 0.0, + "step": 1703, + "time_per_iteration": 2.841059684753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_mlp": 1.07271171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07047077484334266, + "language_loss": 0.82222247, + "learning_rate": 0.0007848995371844474, + "loss": 0.83308667, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1704, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.0816896, + "diversity_loss_mlp": 0.0, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08203255389116743, + "language_loss": 0.80260348, + "learning_rate": 0.0007846434613977801, + "loss": 0.81355333, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1705, + "time_per_iteration": 2.523026466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.0868392, + "diversity_loss_mlp": 0.0, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.07270926258732689, + "language_loss": 0.78603041, + "learning_rate": 0.0007843872751027203, + "loss": 0.7970314, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.13275146, + "routerloss_mlp": 0.0, + "step": 1706, + "time_per_iteration": 2.8923709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915397, + "balance_loss_mlp": 1.59612775, + "diversity_loss_mlp": 0.20258766, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.02966318853366187, + "language_loss": 0.87305748, + "learning_rate": 0.0007841309783987287, + "loss": 0.88221151, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01603885, + "step": 1707, + "time_per_iteration": 2.7517144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115655, + "balance_loss_mlp": 1.10263109, + "diversity_loss_mlp": 0.0, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06500174516261728, + "language_loss": 0.89240694, + "learning_rate": 0.0007838745713853084, + "loss": 0.9035635, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1708, + "time_per_iteration": 2.6181201934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_mlp": 1.10945296, + "diversity_loss_mlp": 0.0, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.06936064314807153, + "language_loss": 0.8434307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85465395, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.12866211, + "routerloss_mlp": 0.0, + "step": 1709, + "time_per_iteration": 2.7040350437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124287, + "balance_loss_mlp": 1.1112572, + "diversity_loss_mlp": 0.0, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06883588356672955, + "language_loss": 0.86454904, + "learning_rate": 0.0007833614268284082, + "loss": 0.87579191, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 1710, + "time_per_iteration": 2.5110740661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425821, + "balance_loss_mlp": 1.41738081, + "diversity_loss_mlp": 0.0, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.1402114647579648, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75535595, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.08447266, + "routerloss_mlp": 0.0, + "step": 1711, + "time_per_iteration": 4.873327016830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129416, + "balance_loss_mlp": 1.11650598, + "diversity_loss_mlp": 0.0, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0798208466882041, + "language_loss": 0.78414649, + "learning_rate": 0.0007828478422289016, + "loss": 0.79544067, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 1712, + "time_per_iteration": 2.608412027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138403, + "balance_loss_mlp": 1.12507582, + "diversity_loss_mlp": 0.0, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07544776571140048, + "language_loss": 0.8909815, + "learning_rate": 0.0007825908851623833, + "loss": 0.90236557, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.13323975, + "routerloss_mlp": 0.0, + "step": 1713, + "time_per_iteration": 2.8033607006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_mlp": 1.12190771, + "diversity_loss_mlp": 0.0, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.06974595077498419, + "language_loss": 0.85003847, + "learning_rate": 0.0007823338183843533, + "loss": 0.86138809, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1714, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148942, + "balance_loss_mlp": 1.13610959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.07049806127627434, + "language_loss": 0.81025606, + "learning_rate": 0.0007820766419946141, + "loss": 0.82174551, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1715, + "time_per_iteration": 3.3007164001464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168148, + "balance_loss_mlp": 1.16008925, + "diversity_loss_mlp": 0.0, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.052131774928428895, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80840629, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1716, + "time_per_iteration": 4.947760105133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906852, + "balance_loss_mlp": 1.58163857, + "diversity_loss_mlp": 0.20079982, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.033697214377685164, + "language_loss": 0.75853068, + "learning_rate": 0.0007815619607794288, + "loss": 0.76759923, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563331, + "step": 1717, + "time_per_iteration": 2.689937114715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173062, + "balance_loss_mlp": 1.1601274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.09689448967864323, + "language_loss": 0.8294118, + "learning_rate": 0.0007813044561538001, + "loss": 0.84114236, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1718, + "time_per_iteration": 3.1421005725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158875, + "balance_loss_mlp": 1.14559531, + "diversity_loss_mlp": 0.0, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06842928932014077, + "language_loss": 0.88578129, + "learning_rate": 0.0007810468423160958, + "loss": 0.89736998, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1719, + "time_per_iteration": 2.8917293548583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157511, + "balance_loss_mlp": 1.14486265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06941390463820386, + "language_loss": 0.81896281, + "learning_rate": 0.0007807891193663306, + "loss": 0.83053792, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 1720, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141979, + "balance_loss_mlp": 1.12950385, + "diversity_loss_mlp": 0.0, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07961809028947962, + "language_loss": 0.82409328, + "learning_rate": 0.0007805312874045614, + "loss": 0.83551311, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1721, + "time_per_iteration": 2.5056259632110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137994, + "balance_loss_mlp": 1.12510777, + "diversity_loss_mlp": 0.0, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.09061115976682882, + "language_loss": 0.86960506, + "learning_rate": 0.0007802733465308874, + "loss": 0.88098502, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1722, + "time_per_iteration": 2.438533306121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_mlp": 1.13225603, + "diversity_loss_mlp": 0.0, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06773749819611302, + "language_loss": 0.84162688, + "learning_rate": 0.0007800152968454501, + "loss": 0.8530758, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1723, + "time_per_iteration": 2.6364991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.12146711, + "diversity_loss_mlp": 0.0, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06044198445597461, + "language_loss": 0.90330362, + "learning_rate": 0.0007797571384484334, + "loss": 0.91464406, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.12567139, + "routerloss_mlp": 0.0, + "step": 1724, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133346, + "balance_loss_mlp": 1.12061453, + "diversity_loss_mlp": 0.0, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.0752969909322094, + "language_loss": 0.91929704, + "learning_rate": 0.0007794988714400633, + "loss": 0.93063056, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 1725, + "time_per_iteration": 2.615788698196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125798, + "balance_loss_mlp": 1.11242867, + "diversity_loss_mlp": 0.0, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.07890733478173245, + "language_loss": 0.85302055, + "learning_rate": 0.0007792404959206079, + "loss": 0.86427855, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.13372803, + "routerloss_mlp": 0.0, + "step": 1726, + "time_per_iteration": 2.545780897140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107165, + "balance_loss_mlp": 1.09446895, + "diversity_loss_mlp": 0.0, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07756389475354548, + "language_loss": 0.81480336, + "learning_rate": 0.0007789820119903774, + "loss": 0.82587504, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.12689209, + "routerloss_mlp": 0.0, + "step": 1727, + "time_per_iteration": 3.005662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114992, + "balance_loss_mlp": 1.10335684, + "diversity_loss_mlp": 0.0, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.03748312413261812, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.7960766, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 1728, + "time_per_iteration": 4.833205223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105872, + "balance_loss_mlp": 1.09285486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.07170574552345628, + "language_loss": 0.83970881, + "learning_rate": 0.0007784647192990428, + "loss": 0.85076749, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 1729, + "time_per_iteration": 2.7309772968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107979, + "balance_loss_mlp": 1.0948776, + "diversity_loss_mlp": 0.0, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06011930461286596, + "language_loss": 0.80777055, + "learning_rate": 0.0007782059107387696, + "loss": 0.81885028, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.13116455, + "routerloss_mlp": 0.0, + "step": 1730, + "time_per_iteration": 2.8615641593933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113195, + "balance_loss_mlp": 1.11733532, + "diversity_loss_mlp": 0.0, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08106060743083753, + "language_loss": 0.88617826, + "learning_rate": 0.0007779469941693826, + "loss": 0.89749771, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1731, + "time_per_iteration": 2.801208257675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126142, + "balance_loss_mlp": 1.11240935, + "diversity_loss_mlp": 0.0, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.09519717038034853, + "language_loss": 0.77091044, + "learning_rate": 0.0007776879696914029, + "loss": 0.78217185, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1732, + "time_per_iteration": 2.8286595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123068, + "balance_loss_mlp": 1.10889435, + "diversity_loss_mlp": 0.0, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.05947539267688924, + "language_loss": 0.88910627, + "learning_rate": 0.000777428837405392, + "loss": 0.90033698, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1733, + "time_per_iteration": 2.8319156169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121491, + "balance_loss_mlp": 1.10701954, + "diversity_loss_mlp": 0.0, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.07113995025739508, + "language_loss": 0.86735553, + "learning_rate": 0.0007771695974119544, + "loss": 0.87857044, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1734, + "time_per_iteration": 2.5376570224761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_mlp": 1.09795249, + "diversity_loss_mlp": 0.0, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.08734149249458338, + "language_loss": 0.75937277, + "learning_rate": 0.0007769102498117359, + "loss": 0.77049315, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1735, + "time_per_iteration": 3.093188524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105698, + "balance_loss_mlp": 1.09138131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06929562674350419, + "language_loss": 0.79383999, + "learning_rate": 0.000776650794705424, + "loss": 0.80489695, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1736, + "time_per_iteration": 3.253673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_mlp": 1.10730791, + "diversity_loss_mlp": 0.0, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.06325878214231093, + "language_loss": 0.82130396, + "learning_rate": 0.0007763912321937483, + "loss": 0.83252084, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1737, + "time_per_iteration": 2.7109947204589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117751, + "balance_loss_mlp": 1.10324299, + "diversity_loss_mlp": 0.0, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.08404595709863052, + "language_loss": 0.82403475, + "learning_rate": 0.0007761315623774799, + "loss": 0.83521223, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1738, + "time_per_iteration": 3.4125657081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109047, + "balance_loss_mlp": 1.0946703, + "diversity_loss_mlp": 0.0, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08421865543081901, + "language_loss": 0.87820536, + "learning_rate": 0.0007758717853574313, + "loss": 0.88929582, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1739, + "time_per_iteration": 2.7345223426818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106099, + "balance_loss_mlp": 1.09184134, + "diversity_loss_mlp": 0.0, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.07638673743764693, + "language_loss": 0.90095574, + "learning_rate": 0.0007756119012344571, + "loss": 0.91201669, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1740, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.08709717, + "diversity_loss_mlp": 0.0, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06863708242027233, + "language_loss": 0.8461023, + "learning_rate": 0.0007753519101094535, + "loss": 0.85711253, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1741, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089984, + "balance_loss_mlp": 1.07595301, + "diversity_loss_mlp": 0.0, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.07992644583812669, + "language_loss": 0.86363387, + "learning_rate": 0.0007750918120833575, + "loss": 0.87453371, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1742, + "time_per_iteration": 2.58940052986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.07488728, + "diversity_loss_mlp": 0.0, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.11201991585260462, + "language_loss": 0.87392128, + "learning_rate": 0.0007748316072571485, + "loss": 0.88480592, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1743, + "time_per_iteration": 2.8557286262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_mlp": 1.07202053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0749416267225997, + "language_loss": 0.79045737, + "learning_rate": 0.0007745712957318467, + "loss": 0.80131996, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1744, + "time_per_iteration": 2.9912548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_mlp": 1.07057166, + "diversity_loss_mlp": 0.0, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06946859722884112, + "language_loss": 0.86471289, + "learning_rate": 0.0007743108776085141, + "loss": 0.87555522, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1745, + "time_per_iteration": 2.7899224758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_mlp": 1.07023191, + "diversity_loss_mlp": 0.0, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08256839233284315, + "language_loss": 0.82965624, + "learning_rate": 0.0007740503529882543, + "loss": 0.84050083, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1746, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_mlp": 1.07044971, + "diversity_loss_mlp": 0.0, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.07349682427851349, + "language_loss": 0.90707254, + "learning_rate": 0.0007737897219722114, + "loss": 0.91791821, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1747, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_mlp": 1.07794499, + "diversity_loss_mlp": 0.0, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.05794758251669461, + "language_loss": 0.81094921, + "learning_rate": 0.0007735289846615716, + "loss": 0.82187206, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1748, + "time_per_iteration": 2.677976369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108166, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.0827866783592608, + "language_loss": 0.823035, + "learning_rate": 0.0007732681411575621, + "loss": 0.8341167, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1749, + "time_per_iteration": 2.674349069595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114062, + "balance_loss_mlp": 1.09997165, + "diversity_loss_mlp": 0.0, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.4203922337067485, + "language_loss": 0.87328398, + "learning_rate": 0.0007730071915614514, + "loss": 0.88442457, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1750, + "time_per_iteration": 2.6714634895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113648, + "balance_loss_mlp": 1.10037947, + "diversity_loss_mlp": 0.0, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09571011442330926, + "language_loss": 0.88792437, + "learning_rate": 0.0007727461359745489, + "loss": 0.89906085, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1751, + "time_per_iteration": 2.469905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141755, + "balance_loss_mlp": 1.12897623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.07412184794878955, + "language_loss": 0.85941112, + "learning_rate": 0.0007724849744982056, + "loss": 0.87082875, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 1752, + "time_per_iteration": 2.6805977821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117715, + "balance_loss_mlp": 1.16388226, + "diversity_loss_mlp": 0.0, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.09378397224837084, + "language_loss": 0.81843758, + "learning_rate": 0.0007722237072338131, + "loss": 0.83020908, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1753, + "time_per_iteration": 2.7348344326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186311, + "balance_loss_mlp": 1.17280459, + "diversity_loss_mlp": 0.0, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.1034159122014491, + "language_loss": 0.85304463, + "learning_rate": 0.0007719623342828046, + "loss": 0.86490774, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1754, + "time_per_iteration": 2.5181336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202577, + "balance_loss_mlp": 1.18872511, + "diversity_loss_mlp": 0.0, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.12703041648808322, + "language_loss": 0.84088987, + "learning_rate": 0.000771700855746654, + "loss": 0.85291564, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1755, + "time_per_iteration": 2.590925931930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188345, + "balance_loss_mlp": 1.1743381, + "diversity_loss_mlp": 0.0, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06849832931784437, + "language_loss": 0.88371092, + "learning_rate": 0.0007714392717268763, + "loss": 0.89559436, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1756, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_mlp": 1.17545295, + "diversity_loss_mlp": 0.0, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.09135673410225151, + "language_loss": 0.8630141, + "learning_rate": 0.0007711775823250273, + "loss": 0.8749072, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1757, + "time_per_iteration": 2.562939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194838, + "balance_loss_mlp": 1.18069935, + "diversity_loss_mlp": 0.0, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.07414503329772545, + "language_loss": 0.83081156, + "learning_rate": 0.0007709157876427039, + "loss": 0.84275991, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1758, + "time_per_iteration": 3.0652947425842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190916, + "balance_loss_mlp": 1.17681408, + "diversity_loss_mlp": 0.0, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.06977999371164574, + "language_loss": 0.85321373, + "learning_rate": 0.0007706538877815439, + "loss": 0.86512285, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1759, + "time_per_iteration": 2.5949320793151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202515, + "balance_loss_mlp": 1.1888063, + "diversity_loss_mlp": 0.0, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.052908737395413206, + "language_loss": 0.83029473, + "learning_rate": 0.0007703918828432259, + "loss": 0.84231991, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1760, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231589, + "balance_loss_mlp": 1.21696198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.11529749255982873, + "language_loss": 0.89274669, + "learning_rate": 0.000770129772929469, + "loss": 0.90506256, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1761, + "time_per_iteration": 2.6486427783966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212596, + "balance_loss_mlp": 1.19812357, + "diversity_loss_mlp": 0.0, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.10010821715075297, + "language_loss": 0.8820551, + "learning_rate": 0.0007698675581420334, + "loss": 0.89418107, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1762, + "time_per_iteration": 2.8473589420318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_mlp": 1.15610099, + "diversity_loss_mlp": 0.0, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06768336788468338, + "language_loss": 0.79040444, + "learning_rate": 0.0007696052385827199, + "loss": 0.80210984, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1763, + "time_per_iteration": 2.9893951416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147034, + "balance_loss_mlp": 1.13271689, + "diversity_loss_mlp": 0.0, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.06731413775333611, + "language_loss": 0.78161937, + "learning_rate": 0.00076934281435337, + "loss": 0.79308975, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1764, + "time_per_iteration": 2.7329161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933074, + "balance_loss_mlp": 1.62411106, + "diversity_loss_mlp": 0.20785357, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0341650984642099, + "language_loss": 0.86205357, + "learning_rate": 0.0007690802855558658, + "loss": 0.87138426, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170921, + "step": 1765, + "time_per_iteration": 2.9281163215637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121638, + "balance_loss_mlp": 1.10924029, + "diversity_loss_mlp": 0.0, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.029090002598214117, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77496594, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 1766, + "time_per_iteration": 4.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104609, + "balance_loss_mlp": 1.08886182, + "diversity_loss_mlp": 0.0, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.08396151855964885, + "language_loss": 0.89357018, + "learning_rate": 0.0007685549146641262, + "loss": 0.90461624, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1767, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_mlp": 1.093521, + "diversity_loss_mlp": 0.0, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.10736891621188589, + "language_loss": 0.8816734, + "learning_rate": 0.0007682920727738579, + "loss": 0.89275646, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1768, + "time_per_iteration": 2.5119268894195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_mlp": 1.08738232, + "diversity_loss_mlp": 0.0, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.10494960168224592, + "language_loss": 0.85048056, + "learning_rate": 0.000768029126723369, + "loss": 0.86150718, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1769, + "time_per_iteration": 2.495424270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090257, + "balance_loss_mlp": 1.07520068, + "diversity_loss_mlp": 0.0, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08686425564719477, + "language_loss": 0.82128584, + "learning_rate": 0.0007677660766147447, + "loss": 0.83218843, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 1770, + "time_per_iteration": 2.532904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_mlp": 1.05578792, + "diversity_loss_mlp": 0.0, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.023964921008177247, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73537892, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 1771, + "time_per_iteration": 4.944117784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.1034112, + "diversity_loss_mlp": 0.0, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.10616133846526872, + "language_loss": 0.795196, + "learning_rate": 0.0007672396646316306, + "loss": 0.80637527, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1772, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134399, + "balance_loss_mlp": 1.11959314, + "diversity_loss_mlp": 0.0, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.07513330183645242, + "language_loss": 0.80376065, + "learning_rate": 0.000766976302961512, + "loss": 0.8151046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1773, + "time_per_iteration": 3.042421340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158934, + "balance_loss_mlp": 1.14410484, + "diversity_loss_mlp": 0.0, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.07872996810077096, + "language_loss": 0.81390858, + "learning_rate": 0.0007667128376420003, + "loss": 0.82549793, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1774, + "time_per_iteration": 2.536562442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208475, + "balance_loss_mlp": 1.19358635, + "diversity_loss_mlp": 0.0, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.08297883362487203, + "language_loss": 0.8462863, + "learning_rate": 0.0007664492687753817, + "loss": 0.85837102, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1775, + "time_per_iteration": 2.6977102756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198612, + "balance_loss_mlp": 1.18424678, + "diversity_loss_mlp": 0.0, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10155126624771216, + "language_loss": 0.81542516, + "learning_rate": 0.000766185596463983, + "loss": 0.82741123, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1776, + "time_per_iteration": 2.6038215160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196202, + "balance_loss_mlp": 1.18163514, + "diversity_loss_mlp": 0.0, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.0897891274607312, + "language_loss": 0.77011722, + "learning_rate": 0.0007659218208101706, + "loss": 0.78207922, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1777, + "time_per_iteration": 3.0933022499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173425, + "balance_loss_mlp": 1.15902483, + "diversity_loss_mlp": 0.0, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.08364054831663822, + "language_loss": 0.85122472, + "learning_rate": 0.0007656579419163515, + "loss": 0.86295897, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1778, + "time_per_iteration": 2.732297420501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146549, + "balance_loss_mlp": 1.13211274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.0722191895240348, + "language_loss": 0.77409559, + "learning_rate": 0.0007653939598849724, + "loss": 0.78556108, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1779, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_mlp": 1.02253902, + "diversity_loss_mlp": 0.0, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.029240552967656448, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83912855, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 1780, + "time_per_iteration": 4.9182775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10688317, + "diversity_loss_mlp": 0.0, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07624931845389674, + "language_loss": 0.80176342, + "learning_rate": 0.000764865686819522, + "loss": 0.81297386, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1781, + "time_per_iteration": 3.0602052211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111853, + "balance_loss_mlp": 1.097965, + "diversity_loss_mlp": 0.0, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.07936344533488468, + "language_loss": 0.85836053, + "learning_rate": 0.0007646013959905449, + "loss": 0.86947906, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1782, + "time_per_iteration": 2.5750925540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_mlp": 1.09528995, + "diversity_loss_mlp": 0.0, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.07233814650781724, + "language_loss": 0.81042612, + "learning_rate": 0.0007643370024341949, + "loss": 0.82151681, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1783, + "time_per_iteration": 3.0870087146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09431553, + "diversity_loss_mlp": 0.0, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.07806584209391611, + "language_loss": 0.83175099, + "learning_rate": 0.0007640725062531195, + "loss": 0.84283221, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1784, + "time_per_iteration": 2.5063886642456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_mlp": 1.08888865, + "diversity_loss_mlp": 0.0, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.5067557182324087, + "language_loss": 0.86699629, + "learning_rate": 0.0007638079075500047, + "loss": 0.87802398, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1785, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015111, + "balance_loss_mlp": 1.00562215, + "diversity_loss_mlp": 0.0, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.016449027395748255, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76195776, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 1786, + "time_per_iteration": 4.944318056106567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.13542247, + "diversity_loss_mlp": 0.0, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.07356798682381475, + "language_loss": 0.83088338, + "learning_rate": 0.0007632784029886026, + "loss": 0.84238386, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1787, + "time_per_iteration": 2.6217002868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204344, + "balance_loss_mlp": 1.1884768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.08799574205003287, + "language_loss": 0.85466659, + "learning_rate": 0.0007630134973358873, + "loss": 0.86671007, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1788, + "time_per_iteration": 2.9664394855499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_mlp": 1.2359066, + "diversity_loss_mlp": 0.0, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.1052875761358054, + "language_loss": 0.86575854, + "learning_rate": 0.0007627484895722763, + "loss": 0.87827688, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1789, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247407, + "balance_loss_mlp": 1.23117065, + "diversity_loss_mlp": 0.0, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.09611070791328494, + "language_loss": 0.80025196, + "learning_rate": 0.0007624833798006552, + "loss": 0.81272602, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.16235352, + "routerloss_mlp": 0.0, + "step": 1790, + "time_per_iteration": 3.046809196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238128, + "balance_loss_mlp": 1.22221315, + "diversity_loss_mlp": 0.0, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.07959093752215074, + "language_loss": 0.83783114, + "learning_rate": 0.0007622181681239483, + "loss": 0.8502124, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1791, + "time_per_iteration": 2.6601433753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_mlp": 1.22793913, + "diversity_loss_mlp": 0.0, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07919089267187412, + "language_loss": 0.84668601, + "learning_rate": 0.0007619528546451202, + "loss": 0.85912943, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 1792, + "time_per_iteration": 2.782947063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208587, + "balance_loss_mlp": 1.19314909, + "diversity_loss_mlp": 0.0, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.07332959959795217, + "language_loss": 0.83832949, + "learning_rate": 0.0007616874394671745, + "loss": 0.85041535, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1793, + "time_per_iteration": 3.3206703662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.169258, + "diversity_loss_mlp": 0.0, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.0713753042238581, + "language_loss": 0.85051751, + "learning_rate": 0.0007614219226931547, + "loss": 0.86236751, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1794, + "time_per_iteration": 2.7190396785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179587, + "balance_loss_mlp": 1.16401851, + "diversity_loss_mlp": 0.0, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.07163818055438703, + "language_loss": 0.8457973, + "learning_rate": 0.0007611563044261435, + "loss": 0.85759324, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 1795, + "time_per_iteration": 2.5077741146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150042, + "balance_loss_mlp": 1.13422251, + "diversity_loss_mlp": 0.0, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0670543853763616, + "language_loss": 0.86376798, + "learning_rate": 0.0007608905847692631, + "loss": 0.8752684, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1796, + "time_per_iteration": 2.4662768840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112741, + "balance_loss_mlp": 1.11171043, + "diversity_loss_mlp": 0.0, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07671810253227593, + "language_loss": 0.86553091, + "learning_rate": 0.0007606247638256749, + "loss": 0.87680501, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 1797, + "time_per_iteration": 2.8649494647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00624206, + "balance_loss_mlp": 1.05204535, + "diversity_loss_mlp": 0.16984753, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0016633519833830733, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.78794497, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01325956, + "step": 1798, + "time_per_iteration": 4.963132619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_mlp": 1.04498482, + "diversity_loss_mlp": 0.0, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.032920799461559694, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80382872, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 1799, + "time_per_iteration": 4.773633003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_mlp": 1.08345306, + "diversity_loss_mlp": 0.0, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.10233507255995049, + "language_loss": 0.85892332, + "learning_rate": 0.0007598266943068686, + "loss": 0.86992049, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 1800, + "time_per_iteration": 2.7380948066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092311, + "balance_loss_mlp": 1.0761466, + "diversity_loss_mlp": 0.0, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.08416075255699706, + "language_loss": 0.83903629, + "learning_rate": 0.0007595604692488507, + "loss": 0.84995937, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1801, + "time_per_iteration": 2.5558300018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099836, + "balance_loss_mlp": 1.08382583, + "diversity_loss_mlp": 0.0, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.0681721192963598, + "language_loss": 0.82674247, + "learning_rate": 0.0007592941434205215, + "loss": 0.83774084, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 1802, + "time_per_iteration": 2.8181002140045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017477, + "balance_loss_mlp": 1.00651026, + "diversity_loss_mlp": 0.0, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.018274165575771096, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588537, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 1803, + "time_per_iteration": 5.063629388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126513, + "balance_loss_mlp": 1.11121821, + "diversity_loss_mlp": 0.0, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07342722091818694, + "language_loss": 0.80217302, + "learning_rate": 0.0007587611898665566, + "loss": 0.81343818, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.15270996, + "routerloss_mlp": 0.0, + "step": 1804, + "time_per_iteration": 3.0994317531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113982, + "balance_loss_mlp": 1.12468028, + "diversity_loss_mlp": 0.0, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.05936466476556785, + "language_loss": 0.82130265, + "learning_rate": 0.0007584945623478315, + "loss": 0.83270085, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1805, + "time_per_iteration": 2.833981513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152624, + "balance_loss_mlp": 1.13780582, + "diversity_loss_mlp": 0.0, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.08744691316973383, + "language_loss": 0.80801159, + "learning_rate": 0.000758227834472617, + "loss": 0.81953788, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1806, + "time_per_iteration": 3.0535178184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166216, + "balance_loss_mlp": 1.15111172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07500761638021176, + "language_loss": 0.77729452, + "learning_rate": 0.0007579610063444664, + "loss": 0.7889567, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1807, + "time_per_iteration": 2.7615864276885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149194, + "balance_loss_mlp": 1.1339947, + "diversity_loss_mlp": 0.0, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.07406875426876382, + "language_loss": 0.87547183, + "learning_rate": 0.0007576940780669712, + "loss": 0.88696373, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1808, + "time_per_iteration": 3.264080762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143539, + "balance_loss_mlp": 1.12863731, + "diversity_loss_mlp": 0.0, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07928472428244501, + "language_loss": 0.84104979, + "learning_rate": 0.0007574270497437624, + "loss": 0.85248518, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1809, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128376, + "balance_loss_mlp": 1.11302221, + "diversity_loss_mlp": 0.0, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.07150597602774303, + "language_loss": 0.88426095, + "learning_rate": 0.000757159921478509, + "loss": 0.89554477, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 1810, + "time_per_iteration": 2.7891488075256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057512, + "balance_loss_mlp": 1.04754615, + "diversity_loss_mlp": 0.0, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.03228641235871289, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75508153, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 1811, + "time_per_iteration": 4.737962007522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103488, + "balance_loss_mlp": 1.08814573, + "diversity_loss_mlp": 0.0, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.07438083858778873, + "language_loss": 0.87798911, + "learning_rate": 0.0007566253655367423, + "loss": 0.88902402, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 1812, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.07600367, + "diversity_loss_mlp": 0.0, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.06854488097647142, + "language_loss": 0.8957805, + "learning_rate": 0.000756357938067762, + "loss": 0.90669596, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 1813, + "time_per_iteration": 2.7090489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.07826209, + "diversity_loss_mlp": 0.0, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.0690606019510397, + "language_loss": 0.8334865, + "learning_rate": 0.0007560904110718033, + "loss": 0.84443069, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1814, + "time_per_iteration": 3.2445590496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096093, + "balance_loss_mlp": 1.08003569, + "diversity_loss_mlp": 0.0, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06223934742271703, + "language_loss": 0.83650601, + "learning_rate": 0.0007558227846527297, + "loss": 0.84746695, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1815, + "time_per_iteration": 2.8504550457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_mlp": 1.08731842, + "diversity_loss_mlp": 0.0, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.07831164241761415, + "language_loss": 0.83117825, + "learning_rate": 0.0007555550589144429, + "loss": 0.84221166, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1816, + "time_per_iteration": 2.4655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_mlp": 1.09515882, + "diversity_loss_mlp": 0.0, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.08460625336983617, + "language_loss": 0.84522688, + "learning_rate": 0.000755287233960883, + "loss": 0.85633731, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1817, + "time_per_iteration": 2.602492094039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.07385683, + "diversity_loss_mlp": 0.0, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.07045705340523431, + "language_loss": 0.77682364, + "learning_rate": 0.0007550193098960292, + "loss": 0.78771949, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1818, + "time_per_iteration": 2.8674800395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00989642, + "balance_loss_mlp": 1.73270237, + "diversity_loss_mlp": 0.21087486, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.029406524514427698, + "language_loss": 0.86412024, + "learning_rate": 0.0007547512868238988, + "loss": 0.87401664, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01785346, + "step": 1819, + "time_per_iteration": 3.151559829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_mlp": 1.07453036, + "diversity_loss_mlp": 0.0, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.06124546921927801, + "language_loss": 0.83503008, + "learning_rate": 0.0007544831648485473, + "loss": 0.84593564, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1820, + "time_per_iteration": 2.6791367530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094234, + "balance_loss_mlp": 1.07806909, + "diversity_loss_mlp": 0.0, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.08232155140582742, + "language_loss": 0.81448233, + "learning_rate": 0.0007542149440740694, + "loss": 0.82542467, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1821, + "time_per_iteration": 2.665632724761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.07229352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.08177047744866778, + "language_loss": 0.85514361, + "learning_rate": 0.000753946624604597, + "loss": 0.8660273, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1822, + "time_per_iteration": 2.708221673965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085159, + "balance_loss_mlp": 1.06938744, + "diversity_loss_mlp": 0.0, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.07022994660183399, + "language_loss": 0.88119262, + "learning_rate": 0.0007536782065443015, + "loss": 0.89204431, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 1823, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109141, + "balance_loss_mlp": 1.0758059, + "diversity_loss_mlp": 0.0, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.09965750131036237, + "language_loss": 0.75038946, + "learning_rate": 0.0007534096899973919, + "loss": 0.7613036, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1824, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089888, + "balance_loss_mlp": 1.07460535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.0636070515998131, + "language_loss": 0.82941401, + "learning_rate": 0.0007531410750681154, + "loss": 0.84031284, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1825, + "time_per_iteration": 2.7595911026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_mlp": 1.08562207, + "diversity_loss_mlp": 0.0, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.09267960960885083, + "language_loss": 0.87015611, + "learning_rate": 0.0007528723618607575, + "loss": 0.88115728, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1826, + "time_per_iteration": 3.4216692447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090335, + "balance_loss_mlp": 1.07524323, + "diversity_loss_mlp": 0.0, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.07214965975453298, + "language_loss": 0.82582879, + "learning_rate": 0.0007526035504796422, + "loss": 0.83673215, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.15087891, + "routerloss_mlp": 0.0, + "step": 1827, + "time_per_iteration": 2.7822000980377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094099, + "balance_loss_mlp": 1.0794003, + "diversity_loss_mlp": 0.0, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.07057247929289283, + "language_loss": 0.86824054, + "learning_rate": 0.0007523346410291312, + "loss": 0.8791815, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1828, + "time_per_iteration": 2.7560181617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098232, + "balance_loss_mlp": 1.08291376, + "diversity_loss_mlp": 0.0, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.0630617970486185, + "language_loss": 0.85159689, + "learning_rate": 0.0007520656336136245, + "loss": 0.86257917, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1829, + "time_per_iteration": 2.9432313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098974, + "balance_loss_mlp": 1.08431172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06541232162591855, + "language_loss": 0.88230217, + "learning_rate": 0.0007517965283375599, + "loss": 0.89329195, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1830, + "time_per_iteration": 2.8773486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_mlp": 1.08363926, + "diversity_loss_mlp": 0.0, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.06973135687475002, + "language_loss": 0.89511967, + "learning_rate": 0.0007515273253054132, + "loss": 0.90610522, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1831, + "time_per_iteration": 2.662757396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097986, + "balance_loss_mlp": 1.08288169, + "diversity_loss_mlp": 0.0, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.07142201858296882, + "language_loss": 0.82785273, + "learning_rate": 0.0007512580246216988, + "loss": 0.83883256, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1832, + "time_per_iteration": 2.730994939804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.08164394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.07119734441282773, + "language_loss": 0.84715027, + "learning_rate": 0.000750988626390968, + "loss": 0.85811406, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1833, + "time_per_iteration": 2.604182004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.07508624, + "diversity_loss_mlp": 0.0, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.07060575001723658, + "language_loss": 0.85089648, + "learning_rate": 0.0007507191307178108, + "loss": 0.86179501, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1834, + "time_per_iteration": 2.7584774494171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_mlp": 1.06808281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.09392412586459238, + "language_loss": 0.75105453, + "learning_rate": 0.0007504495377068543, + "loss": 0.76188982, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1835, + "time_per_iteration": 2.731039524078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087025, + "balance_loss_mlp": 1.07230306, + "diversity_loss_mlp": 0.0, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09299008065025831, + "language_loss": 0.81784093, + "learning_rate": 0.0007501798474627642, + "loss": 0.82871115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1836, + "time_per_iteration": 2.9180665016174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092183, + "balance_loss_mlp": 1.07738876, + "diversity_loss_mlp": 0.0, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.06800399913452355, + "language_loss": 0.8354817, + "learning_rate": 0.0007499100600902433, + "loss": 0.84640354, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1837, + "time_per_iteration": 2.981478452682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097597, + "balance_loss_mlp": 1.08236217, + "diversity_loss_mlp": 0.0, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.07178124654929893, + "language_loss": 0.83625698, + "learning_rate": 0.0007496401756940324, + "loss": 0.84723294, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1838, + "time_per_iteration": 2.7256877422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107267, + "balance_loss_mlp": 1.09267545, + "diversity_loss_mlp": 0.0, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.08438072522416575, + "language_loss": 0.81940264, + "learning_rate": 0.0007493701943789098, + "loss": 0.83047533, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1839, + "time_per_iteration": 2.805553674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_mlp": 1.10266685, + "diversity_loss_mlp": 0.0, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07000666511795951, + "language_loss": 0.82830888, + "learning_rate": 0.000749100116249692, + "loss": 0.83948612, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1840, + "time_per_iteration": 2.608135223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00954188, + "balance_loss_mlp": 1.66862321, + "diversity_loss_mlp": 0.20571998, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.03743173710930313, + "language_loss": 0.86076337, + "learning_rate": 0.0007488299414112321, + "loss": 0.87030524, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01701665, + "step": 1841, + "time_per_iteration": 2.6307811737060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_mlp": 1.10974133, + "diversity_loss_mlp": 0.0, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.06710116446149988, + "language_loss": 0.77204335, + "learning_rate": 0.0007485596699684215, + "loss": 0.78328466, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1842, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132611, + "balance_loss_mlp": 1.11780548, + "diversity_loss_mlp": 0.0, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.07987851383877129, + "language_loss": 0.85353696, + "learning_rate": 0.000748289302026189, + "loss": 0.86486304, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1843, + "time_per_iteration": 2.8449106216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_mlp": 1.11339569, + "diversity_loss_mlp": 0.0, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.06918658934745357, + "language_loss": 0.85752398, + "learning_rate": 0.0007480188376895004, + "loss": 0.86880362, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1844, + "time_per_iteration": 3.0339298248291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160602, + "balance_loss_mlp": 1.15135121, + "diversity_loss_mlp": 0.0, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.06421168097867443, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74971944, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 1845, + "time_per_iteration": 4.932978391647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.10506296, + "diversity_loss_mlp": 0.0, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08194467088107492, + "language_loss": 0.78768218, + "learning_rate": 0.0007474776202528074, + "loss": 0.79887938, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1846, + "time_per_iteration": 2.9188990592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111713, + "balance_loss_mlp": 1.1021452, + "diversity_loss_mlp": 0.0, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08015412782248336, + "language_loss": 0.80999184, + "learning_rate": 0.000747206867362922, + "loss": 0.82116312, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1847, + "time_per_iteration": 3.0966272354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_mlp": 1.085235, + "diversity_loss_mlp": 0.0, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.09857033029565816, + "language_loss": 0.836568, + "learning_rate": 0.0007469360184988194, + "loss": 0.84756613, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1848, + "time_per_iteration": 2.9021246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_mlp": 1.08986914, + "diversity_loss_mlp": 0.0, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08185517170087683, + "language_loss": 0.86821651, + "learning_rate": 0.0007466650737656518, + "loss": 0.8792634, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1849, + "time_per_iteration": 2.615549325942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_mlp": 1.0876888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06916390030254578, + "language_loss": 0.89687926, + "learning_rate": 0.0007463940332686098, + "loss": 0.9078998, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1850, + "time_per_iteration": 2.497159242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931214, + "balance_loss_mlp": 1.62144685, + "diversity_loss_mlp": 0.20650919, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.030410176313075864, + "language_loss": 0.84120536, + "learning_rate": 0.0007461228971129205, + "loss": 0.85051751, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01723633, + "step": 1851, + "time_per_iteration": 2.959170341491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931448, + "balance_loss_mlp": 1.62270963, + "diversity_loss_mlp": 0.20620242, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.03221270440610224, + "language_loss": 0.85523784, + "learning_rate": 0.0007458516654038483, + "loss": 0.86455238, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01699215, + "step": 1852, + "time_per_iteration": 2.6886868476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.13526964, + "diversity_loss_mlp": 0.0, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06572834298852859, + "language_loss": 0.86835778, + "learning_rate": 0.0007455803382466946, + "loss": 0.8798511, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1853, + "time_per_iteration": 2.8323659896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151398, + "balance_loss_mlp": 1.13686657, + "diversity_loss_mlp": 0.0, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.06349489422764842, + "language_loss": 0.86956179, + "learning_rate": 0.0007453089157467979, + "loss": 0.88107574, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1854, + "time_per_iteration": 2.817117929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.13687038, + "diversity_loss_mlp": 0.0, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06687597930641362, + "language_loss": 0.8221277, + "learning_rate": 0.0007450373980095341, + "loss": 0.83364242, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1855, + "time_per_iteration": 3.0857772827148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_mlp": 1.13494754, + "diversity_loss_mlp": 0.0, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.0656889709190827, + "language_loss": 0.86804116, + "learning_rate": 0.0007447657851403155, + "loss": 0.87952584, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1856, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144273, + "balance_loss_mlp": 1.1303966, + "diversity_loss_mlp": 0.0, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.08894932465162153, + "language_loss": 0.78988904, + "learning_rate": 0.0007444940772445915, + "loss": 0.80133176, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1857, + "time_per_iteration": 2.752232551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122576, + "balance_loss_mlp": 1.10860419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06705763345081875, + "language_loss": 0.80129987, + "learning_rate": 0.0007442222744278484, + "loss": 0.81252563, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1858, + "time_per_iteration": 2.638322591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_mlp": 1.09717393, + "diversity_loss_mlp": 0.0, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.05935371072747042, + "language_loss": 0.8399322, + "learning_rate": 0.0007439503767956099, + "loss": 0.85104102, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.137146, + "routerloss_mlp": 0.0, + "step": 1859, + "time_per_iteration": 2.699204921722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124434, + "balance_loss_mlp": 1.11480188, + "diversity_loss_mlp": 0.0, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.03541879327423246, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80796039, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 1860, + "time_per_iteration": 4.89499831199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089258, + "balance_loss_mlp": 1.07479787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.06413043417122823, + "language_loss": 0.86215138, + "learning_rate": 0.000743406297506922, + "loss": 0.87304389, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1861, + "time_per_iteration": 2.7184388637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919817, + "balance_loss_mlp": 1.60078692, + "diversity_loss_mlp": 0.20507258, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.028510278569739433, + "language_loss": 0.84439111, + "learning_rate": 0.0007431341160617031, + "loss": 0.8535893, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01688758, + "step": 1862, + "time_per_iteration": 2.8915610313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_mlp": 1.06988358, + "diversity_loss_mlp": 0.0, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06954606141633879, + "language_loss": 0.88100171, + "learning_rate": 0.0007428618402234491, + "loss": 0.8918457, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1863, + "time_per_iteration": 2.6724555492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087919, + "balance_loss_mlp": 1.0733279, + "diversity_loss_mlp": 0.0, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.07542508091229044, + "language_loss": 0.80288851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81376767, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1864, + "time_per_iteration": 2.724853038787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.06996608, + "diversity_loss_mlp": 0.0, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07695346444963648, + "language_loss": 0.7981261, + "learning_rate": 0.0007423170057906996, + "loss": 0.80896473, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1865, + "time_per_iteration": 3.9006779193878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108627, + "balance_loss_mlp": 1.0722512, + "diversity_loss_mlp": 0.0, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.07814080760266444, + "language_loss": 0.86228722, + "learning_rate": 0.0007420444474077275, + "loss": 0.87314993, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1866, + "time_per_iteration": 2.546194076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095193, + "balance_loss_mlp": 1.0812335, + "diversity_loss_mlp": 0.0, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.0773553058948038, + "language_loss": 0.8949936, + "learning_rate": 0.0007417717950547671, + "loss": 0.90594554, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1867, + "time_per_iteration": 2.5670700073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_mlp": 1.04262233, + "diversity_loss_mlp": 0.0, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.023944930622272237, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.770491, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 1868, + "time_per_iteration": 4.900780200958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_mlp": 1.087533, + "diversity_loss_mlp": 0.0, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.06547244306940128, + "language_loss": 0.84938717, + "learning_rate": 0.0007412262088623299, + "loss": 0.86040014, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1869, + "time_per_iteration": 2.7674195766448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092029, + "balance_loss_mlp": 1.60128522, + "diversity_loss_mlp": 0.20662443, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.03542659619783611, + "language_loss": 0.79155517, + "learning_rate": 0.0007409532752346684, + "loss": 0.80075806, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633519, + "step": 1870, + "time_per_iteration": 2.7116785049438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111485, + "balance_loss_mlp": 1.101367, + "diversity_loss_mlp": 0.0, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.061502004439029076, + "language_loss": 0.8836326, + "learning_rate": 0.0007406802480606491, + "loss": 0.89478111, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1871, + "time_per_iteration": 2.642608165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105605, + "balance_loss_mlp": 1.0916698, + "diversity_loss_mlp": 0.0, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.06939665757215846, + "language_loss": 0.90353388, + "learning_rate": 0.0007404071274462707, + "loss": 0.91458994, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.1394043, + "routerloss_mlp": 0.0, + "step": 1872, + "time_per_iteration": 2.5600955486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09967744, + "diversity_loss_mlp": 0.0, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.07241097832053987, + "language_loss": 0.83719409, + "learning_rate": 0.0007401339134975682, + "loss": 0.84832925, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1873, + "time_per_iteration": 2.6775293350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111162, + "balance_loss_mlp": 1.09724998, + "diversity_loss_mlp": 0.0, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.07980684605652169, + "language_loss": 0.84604299, + "learning_rate": 0.0007398606063206122, + "loss": 0.85715467, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1874, + "time_per_iteration": 2.6092889308929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109131, + "balance_loss_mlp": 1.09546924, + "diversity_loss_mlp": 0.0, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09304103013369584, + "language_loss": 0.78818524, + "learning_rate": 0.0007395872060215101, + "loss": 0.79927647, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1875, + "time_per_iteration": 2.5999374389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124779, + "balance_loss_mlp": 1.11121297, + "diversity_loss_mlp": 0.0, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08049441369365674, + "language_loss": 0.8851527, + "learning_rate": 0.0007393137127064056, + "loss": 0.89640045, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1876, + "time_per_iteration": 2.635896682739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_mlp": 1.11380959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.06613177233605298, + "language_loss": 0.84377646, + "learning_rate": 0.0007390401264814779, + "loss": 0.8550508, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1877, + "time_per_iteration": 2.597508192062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151319, + "balance_loss_mlp": 1.1378243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.09083655630754779, + "language_loss": 0.84454513, + "learning_rate": 0.0007387664474529427, + "loss": 0.8560583, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1878, + "time_per_iteration": 2.6493661403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143725, + "balance_loss_mlp": 1.1302073, + "diversity_loss_mlp": 0.0, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.0643860955644754, + "language_loss": 0.91379291, + "learning_rate": 0.0007384926757270518, + "loss": 0.92523015, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1879, + "time_per_iteration": 2.62565016746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152012, + "balance_loss_mlp": 1.13819528, + "diversity_loss_mlp": 0.0, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07609143241795291, + "language_loss": 0.80057949, + "learning_rate": 0.0007382188114100924, + "loss": 0.81209958, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1880, + "time_per_iteration": 2.974212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.14148784, + "diversity_loss_mlp": 0.0, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.0632350243804942, + "language_loss": 0.8182314, + "learning_rate": 0.0007379448546083884, + "loss": 0.82978803, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1881, + "time_per_iteration": 2.894099712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154364, + "balance_loss_mlp": 1.14052355, + "diversity_loss_mlp": 0.0, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06232367753538678, + "language_loss": 0.8822301, + "learning_rate": 0.0007376708054282992, + "loss": 0.89377379, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1882, + "time_per_iteration": 2.9576163291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162916, + "balance_loss_mlp": 1.14919519, + "diversity_loss_mlp": 0.0, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06608098206448941, + "language_loss": 0.83563071, + "learning_rate": 0.0007373966639762201, + "loss": 0.84725988, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1883, + "time_per_iteration": 2.6004068851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158732, + "balance_loss_mlp": 1.14478457, + "diversity_loss_mlp": 0.0, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.07441448138889938, + "language_loss": 0.88544619, + "learning_rate": 0.0007371224303585822, + "loss": 0.89703357, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1884, + "time_per_iteration": 2.5741078853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109461, + "balance_loss_mlp": 1.09897089, + "diversity_loss_mlp": 0.0, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.03545085729862102, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81466532, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 1885, + "time_per_iteration": 4.706872224807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.13442218, + "diversity_loss_mlp": 0.0, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.0691831634947964, + "language_loss": 0.8278423, + "learning_rate": 0.0007365736870525335, + "loss": 0.83932269, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1886, + "time_per_iteration": 2.8480284214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.12236464, + "diversity_loss_mlp": 0.0, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.0786816251155578, + "language_loss": 0.82659888, + "learning_rate": 0.000736299177577164, + "loss": 0.83795714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1887, + "time_per_iteration": 2.601449966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127197, + "balance_loss_mlp": 1.11358309, + "diversity_loss_mlp": 0.0, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0767010159800114, + "language_loss": 0.8381778, + "learning_rate": 0.0007360245763623174, + "loss": 0.84944975, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1888, + "time_per_iteration": 2.6951138973236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_mlp": 1.09350717, + "diversity_loss_mlp": 0.0, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06311908909694558, + "language_loss": 0.89886129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90992391, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1889, + "time_per_iteration": 2.8509137630462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094399, + "balance_loss_mlp": 1.08141732, + "diversity_loss_mlp": 0.0, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.06820711534899371, + "language_loss": 0.86674547, + "learning_rate": 0.0007354750991406684, + "loss": 0.87768942, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1890, + "time_per_iteration": 2.7162795066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_mlp": 1.07673419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07876014589837055, + "language_loss": 0.80930853, + "learning_rate": 0.0007352002233471919, + "loss": 0.82020569, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1891, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091374, + "balance_loss_mlp": 1.07835662, + "diversity_loss_mlp": 0.0, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.08103720744805817, + "language_loss": 0.79372823, + "learning_rate": 0.0007349252562408906, + "loss": 0.80464196, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1892, + "time_per_iteration": 2.6752734184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097706, + "balance_loss_mlp": 1.08496833, + "diversity_loss_mlp": 0.0, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.07356128462514616, + "language_loss": 0.81490725, + "learning_rate": 0.0007346501979285158, + "loss": 0.82588428, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1893, + "time_per_iteration": 2.8990893363952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_mlp": 1.03214884, + "diversity_loss_mlp": 0.0, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.022756463517582398, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81579787, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 1894, + "time_per_iteration": 4.8097145557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098768, + "balance_loss_mlp": 1.0857501, + "diversity_loss_mlp": 0.0, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06969655176236832, + "language_loss": 0.85880721, + "learning_rate": 0.0007340998081127308, + "loss": 0.86979485, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1895, + "time_per_iteration": 2.757380485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087705, + "balance_loss_mlp": 1.074646, + "diversity_loss_mlp": 0.0, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06910669114263218, + "language_loss": 0.91127002, + "learning_rate": 0.0007338244768230007, + "loss": 0.92214715, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1896, + "time_per_iteration": 2.7967634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098629, + "balance_loss_mlp": 1.08584976, + "diversity_loss_mlp": 0.0, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.05804787602656793, + "language_loss": 0.88684666, + "learning_rate": 0.0007335490547545578, + "loss": 0.89783299, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1897, + "time_per_iteration": 3.086498260498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_mlp": 1.08286643, + "diversity_loss_mlp": 0.0, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06953546528053214, + "language_loss": 0.82679451, + "learning_rate": 0.0007332735420143308, + "loss": 0.83774823, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1898, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097867, + "balance_loss_mlp": 1.08476591, + "diversity_loss_mlp": 0.0, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.07600656362423025, + "language_loss": 0.86647844, + "learning_rate": 0.0007329979387092826, + "loss": 0.87745708, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1899, + "time_per_iteration": 2.5437934398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_mlp": 1.08821869, + "diversity_loss_mlp": 0.0, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.05952938167480439, + "language_loss": 0.83796108, + "learning_rate": 0.0007327222449464124, + "loss": 0.8489722, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1900, + "time_per_iteration": 3.2824244499206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_mlp": 1.09499097, + "diversity_loss_mlp": 0.0, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07745224305421915, + "language_loss": 0.88634431, + "learning_rate": 0.0007324464608327538, + "loss": 0.89742231, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.12823486, + "routerloss_mlp": 0.0, + "step": 1901, + "time_per_iteration": 2.6411991119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_mlp": 1.08995461, + "diversity_loss_mlp": 0.0, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.08223816362142805, + "language_loss": 0.88474846, + "learning_rate": 0.0007321705864753758, + "loss": 0.89577842, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.1305542, + "routerloss_mlp": 0.0, + "step": 1902, + "time_per_iteration": 2.682002544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931657, + "balance_loss_mlp": 1.62497878, + "diversity_loss_mlp": 0.20707282, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.026825446902959647, + "language_loss": 0.84137708, + "learning_rate": 0.0007318946219813823, + "loss": 0.85069364, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563089, + "step": 1903, + "time_per_iteration": 3.0061404705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108165, + "balance_loss_mlp": 1.09403849, + "diversity_loss_mlp": 0.0, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.07526416733947026, + "language_loss": 0.89736164, + "learning_rate": 0.000731618567457912, + "loss": 0.90844321, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.14105225, + "routerloss_mlp": 0.0, + "step": 1904, + "time_per_iteration": 2.6523027420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099252, + "balance_loss_mlp": 1.08536446, + "diversity_loss_mlp": 0.0, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07605082206895837, + "language_loss": 0.87058568, + "learning_rate": 0.000731342423012139, + "loss": 0.88157821, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1905, + "time_per_iteration": 3.0595312118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_mlp": 1.08213234, + "diversity_loss_mlp": 0.0, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07718853495225737, + "language_loss": 0.82559443, + "learning_rate": 0.0007310661887512722, + "loss": 0.83655763, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1906, + "time_per_iteration": 3.056859016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090478, + "balance_loss_mlp": 1.07672131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.07458396044121823, + "language_loss": 0.8194133, + "learning_rate": 0.0007307898647825549, + "loss": 0.83031803, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1907, + "time_per_iteration": 2.670468807220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_mlp": 1.07666349, + "diversity_loss_mlp": 0.0, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09231339543244264, + "language_loss": 0.89368939, + "learning_rate": 0.0007305134512132659, + "loss": 0.90459347, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.13751221, + "routerloss_mlp": 0.0, + "step": 1908, + "time_per_iteration": 2.6561663150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091843, + "balance_loss_mlp": 1.07826495, + "diversity_loss_mlp": 0.0, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.08913139219920335, + "language_loss": 0.83308864, + "learning_rate": 0.0007302369481507183, + "loss": 0.84400707, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1909, + "time_per_iteration": 2.5485799312591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017138, + "balance_loss_mlp": 1.00979447, + "diversity_loss_mlp": 0.0, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.013277678950868657, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.80978894, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1910, + "time_per_iteration": 4.848855257034302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111697, + "balance_loss_mlp": 1.09842944, + "diversity_loss_mlp": 0.0, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.058739485749840115, + "language_loss": 0.85315347, + "learning_rate": 0.000729683673975274, + "loss": 0.86427045, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.13287354, + "routerloss_mlp": 0.0, + "step": 1911, + "time_per_iteration": 2.690218210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114863, + "balance_loss_mlp": 1.10165429, + "diversity_loss_mlp": 0.0, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05585809452393386, + "language_loss": 0.8291769, + "learning_rate": 0.0007294069030771774, + "loss": 0.84032547, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1912, + "time_per_iteration": 3.678927183151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125561, + "balance_loss_mlp": 1.1124301, + "diversity_loss_mlp": 0.0, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.06389765233013874, + "language_loss": 0.90667701, + "learning_rate": 0.0007291300431154224, + "loss": 0.91793263, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1913, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.03611672, + "diversity_loss_mlp": 0.0, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.02051984405011318, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7143358, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1914, + "time_per_iteration": 4.973980903625488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137116, + "balance_loss_mlp": 1.12441444, + "diversity_loss_mlp": 0.0, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.0814243559806059, + "language_loss": 0.7981922, + "learning_rate": 0.0007285760564309179, + "loss": 0.8095634, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.1270752, + "routerloss_mlp": 0.0, + "step": 1915, + "time_per_iteration": 3.091447353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127931, + "balance_loss_mlp": 1.11485386, + "diversity_loss_mlp": 0.0, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.09574055809111115, + "language_loss": 0.84848046, + "learning_rate": 0.0007282989299232448, + "loss": 0.85975981, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.13092041, + "routerloss_mlp": 0.0, + "step": 1916, + "time_per_iteration": 3.074547052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113017, + "balance_loss_mlp": 1.09977341, + "diversity_loss_mlp": 0.0, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.08763204320127825, + "language_loss": 0.83209801, + "learning_rate": 0.0007280217147820668, + "loss": 0.84322822, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1917, + "time_per_iteration": 2.6260228157043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092055, + "balance_loss_mlp": 1.07888198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06316346716689762, + "language_loss": 0.79465461, + "learning_rate": 0.0007277444111150079, + "loss": 0.80557513, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.13189697, + "routerloss_mlp": 0.0, + "step": 1918, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_mlp": 1.07465601, + "diversity_loss_mlp": 0.0, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.09595367080188737, + "language_loss": 0.84512901, + "learning_rate": 0.0007274670190297272, + "loss": 0.85601443, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1919, + "time_per_iteration": 2.590839147567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_mlp": 1.07205224, + "diversity_loss_mlp": 0.0, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.07431087712553297, + "language_loss": 0.82079387, + "learning_rate": 0.0007271895386339179, + "loss": 0.83165228, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1920, + "time_per_iteration": 2.7924282550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094639, + "balance_loss_mlp": 1.08048892, + "diversity_loss_mlp": 0.0, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.07797312778631413, + "language_loss": 0.83431751, + "learning_rate": 0.0007269119700353073, + "loss": 0.84526384, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1921, + "time_per_iteration": 2.7155139446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_mlp": 1.0987196, + "diversity_loss_mlp": 0.0, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.07250682713227712, + "language_loss": 0.84994757, + "learning_rate": 0.0007266343133416571, + "loss": 0.86107111, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1922, + "time_per_iteration": 2.7394983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.06564641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.035523530201468645, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78190196, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.07617188, + "routerloss_mlp": 0.0, + "step": 1923, + "time_per_iteration": 4.877161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10153794, + "diversity_loss_mlp": 0.0, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.0789330271899564, + "language_loss": 0.84356588, + "learning_rate": 0.0007260787361004556, + "loss": 0.85471952, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1924, + "time_per_iteration": 2.608745813369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_mlp": 1.02985299, + "diversity_loss_mlp": 0.0, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.021371165562314075, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74798417, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 1925, + "time_per_iteration": 4.906585931777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114233, + "balance_loss_mlp": 1.10069048, + "diversity_loss_mlp": 0.0, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.12026638393290963, + "language_loss": 0.87422252, + "learning_rate": 0.0007255228077730903, + "loss": 0.88536477, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1926, + "time_per_iteration": 2.6886680126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_mlp": 1.11107421, + "diversity_loss_mlp": 0.0, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06719853297068734, + "language_loss": 0.81722987, + "learning_rate": 0.0007252447122218632, + "loss": 0.82846814, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1927, + "time_per_iteration": 3.1511058807373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_mlp": 1.11258984, + "diversity_loss_mlp": 0.0, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.08764579691953547, + "language_loss": 0.87849444, + "learning_rate": 0.0007249665292228834, + "loss": 0.88974959, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1928, + "time_per_iteration": 2.565991163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120289, + "balance_loss_mlp": 1.1073308, + "diversity_loss_mlp": 0.0, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.0633685198143462, + "language_loss": 0.83318496, + "learning_rate": 0.000724688258884151, + "loss": 0.84438789, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1929, + "time_per_iteration": 2.531827926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.10286927, + "diversity_loss_mlp": 0.0, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.05744658583323744, + "language_loss": 0.86564112, + "learning_rate": 0.0007244099013137002, + "loss": 0.8767941, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1930, + "time_per_iteration": 3.1130166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.10404849, + "diversity_loss_mlp": 0.0, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.06880018611034966, + "language_loss": 0.88695574, + "learning_rate": 0.0007241314566195993, + "loss": 0.89812243, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.1262207, + "routerloss_mlp": 0.0, + "step": 1931, + "time_per_iteration": 3.374743700027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110097, + "balance_loss_mlp": 1.08821416, + "diversity_loss_mlp": 0.0, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.06303779661636588, + "language_loss": 0.85510373, + "learning_rate": 0.0007238529249099496, + "loss": 0.86611342, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1932, + "time_per_iteration": 2.6654059886932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_mlp": 1.0911988, + "diversity_loss_mlp": 0.0, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03412398452916775, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78954613, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 1933, + "time_per_iteration": 4.851354598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091175, + "balance_loss_mlp": 1.07859278, + "diversity_loss_mlp": 0.0, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.08014253307267598, + "language_loss": 0.80636895, + "learning_rate": 0.000723295600876581, + "loss": 0.81728071, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 1934, + "time_per_iteration": 3.0025534629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097044, + "balance_loss_mlp": 1.08416963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.08698689907724866, + "language_loss": 0.88006312, + "learning_rate": 0.0007230168087692344, + "loss": 0.89103359, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.12872314, + "routerloss_mlp": 0.0, + "step": 1935, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095813, + "balance_loss_mlp": 1.0830214, + "diversity_loss_mlp": 0.0, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07031074193849007, + "language_loss": 0.82382512, + "learning_rate": 0.0007227379300790839, + "loss": 0.8347832, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1936, + "time_per_iteration": 3.0040676593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092601, + "balance_loss_mlp": 1.07969058, + "diversity_loss_mlp": 0.0, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.07132774808829288, + "language_loss": 0.85478282, + "learning_rate": 0.0007224589649143997, + "loss": 0.86570889, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 1937, + "time_per_iteration": 2.584545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089825, + "balance_loss_mlp": 1.07662272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.0711139803163438, + "language_loss": 0.8120302, + "learning_rate": 0.0007221799133834861, + "loss": 0.82292843, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.13214111, + "routerloss_mlp": 0.0, + "step": 1938, + "time_per_iteration": 2.6393649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109955, + "balance_loss_mlp": 1.08649623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20460237815205612, + "language_loss": 0.81793052, + "learning_rate": 0.00072190077559468, + "loss": 0.82892597, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1939, + "time_per_iteration": 2.5494682788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127139, + "balance_loss_mlp": 1.1140976, + "diversity_loss_mlp": 0.0, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.05817015695703163, + "language_loss": 0.89248812, + "learning_rate": 0.0007216215516563527, + "loss": 0.90375948, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.13049316, + "routerloss_mlp": 0.0, + "step": 1940, + "time_per_iteration": 2.6755452156066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129035, + "balance_loss_mlp": 1.1159811, + "diversity_loss_mlp": 0.0, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.07778932214282369, + "language_loss": 0.83852386, + "learning_rate": 0.0007213422416769083, + "loss": 0.84981418, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1941, + "time_per_iteration": 2.6008002758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_mlp": 1.12319708, + "diversity_loss_mlp": 0.0, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.06345716224902766, + "language_loss": 0.7501297, + "learning_rate": 0.0007210628457647849, + "loss": 0.76148963, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1942, + "time_per_iteration": 2.5911362171173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_mlp": 1.12763917, + "diversity_loss_mlp": 0.0, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.06753886702103719, + "language_loss": 0.78585184, + "learning_rate": 0.000720783364028453, + "loss": 0.7972604, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.13238525, + "routerloss_mlp": 0.0, + "step": 1943, + "time_per_iteration": 2.7490458488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149977, + "balance_loss_mlp": 1.13685822, + "diversity_loss_mlp": 0.0, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.0650742437261564, + "language_loss": 0.87667847, + "learning_rate": 0.0007205037965764177, + "loss": 0.88817823, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1944, + "time_per_iteration": 2.5870554447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_mlp": 1.12192512, + "diversity_loss_mlp": 0.0, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07468357539719116, + "language_loss": 0.85650361, + "learning_rate": 0.0007202241435172161, + "loss": 0.86785173, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1945, + "time_per_iteration": 2.7550253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131811, + "balance_loss_mlp": 1.11901414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07270487210957549, + "language_loss": 0.87884831, + "learning_rate": 0.0007199444049594198, + "loss": 0.8901664, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1946, + "time_per_iteration": 2.9499337673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111095, + "balance_loss_mlp": 1.09783912, + "diversity_loss_mlp": 0.0, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.07247382516020226, + "language_loss": 0.83384776, + "learning_rate": 0.0007196645810116322, + "loss": 0.84495866, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1947, + "time_per_iteration": 2.70394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113218, + "balance_loss_mlp": 1.1003499, + "diversity_loss_mlp": 0.0, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07522309633784076, + "language_loss": 0.84431696, + "learning_rate": 0.0007193846717824912, + "loss": 0.8554492, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1948, + "time_per_iteration": 2.923752546310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116002, + "balance_loss_mlp": 1.10312748, + "diversity_loss_mlp": 0.0, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.06883561802065806, + "language_loss": 0.88268626, + "learning_rate": 0.0007191046773806669, + "loss": 0.89384627, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1949, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108921, + "balance_loss_mlp": 1.09593272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07969110082801287, + "language_loss": 0.83211446, + "learning_rate": 0.0007188245979148631, + "loss": 0.84320366, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1950, + "time_per_iteration": 3.193124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_mlp": 1.09892154, + "diversity_loss_mlp": 0.0, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.07005872092850987, + "language_loss": 0.87434363, + "learning_rate": 0.0007185444334938157, + "loss": 0.88546085, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1951, + "time_per_iteration": 2.669201135635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_mlp": 1.0892663, + "diversity_loss_mlp": 0.0, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.08195801919923047, + "language_loss": 0.85047525, + "learning_rate": 0.0007182641842262947, + "loss": 0.86149311, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 1952, + "time_per_iteration": 2.602139472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092197, + "balance_loss_mlp": 1.07936394, + "diversity_loss_mlp": 0.0, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.07349771430020792, + "language_loss": 0.77754879, + "learning_rate": 0.0007179838502211022, + "loss": 0.78847075, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.128479, + "routerloss_mlp": 0.0, + "step": 1953, + "time_per_iteration": 2.85720157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_mlp": 1.08148086, + "diversity_loss_mlp": 0.0, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0681681729591206, + "language_loss": 0.86330736, + "learning_rate": 0.0007177034315870738, + "loss": 0.87425238, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1954, + "time_per_iteration": 2.958862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_mlp": 1.08803654, + "diversity_loss_mlp": 0.0, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06642365438263753, + "language_loss": 0.90809441, + "learning_rate": 0.0007174229284330773, + "loss": 0.91910505, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1955, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_mlp": 1.07642531, + "diversity_loss_mlp": 0.0, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.07788827503332588, + "language_loss": 0.86705017, + "learning_rate": 0.0007171423408680141, + "loss": 0.87794375, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1956, + "time_per_iteration": 2.8101606369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00950311, + "balance_loss_mlp": 1.6602329, + "diversity_loss_mlp": 0.20739825, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.03218717292019043, + "language_loss": 0.89567441, + "learning_rate": 0.0007168616690008176, + "loss": 0.90517747, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01649548, + "step": 1957, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081569, + "balance_loss_mlp": 1.06840825, + "diversity_loss_mlp": 0.0, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.07242251254882147, + "language_loss": 0.85681045, + "learning_rate": 0.0007165809129404545, + "loss": 0.86762613, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1958, + "time_per_iteration": 2.8396048545837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_mlp": 1.07657433, + "diversity_loss_mlp": 0.0, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08227545286248691, + "language_loss": 0.86212921, + "learning_rate": 0.0007163000727959239, + "loss": 0.87303019, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1959, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087148, + "balance_loss_mlp": 1.07989979, + "diversity_loss_mlp": 0.0, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.05215322395932221, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79046214, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 1960, + "time_per_iteration": 4.869986057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095327, + "balance_loss_mlp": 1.08232689, + "diversity_loss_mlp": 0.0, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.08048811275026858, + "language_loss": 0.84568793, + "learning_rate": 0.00071573814069052, + "loss": 0.85664117, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.13018799, + "routerloss_mlp": 0.0, + "step": 1961, + "time_per_iteration": 2.9122819900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109901, + "balance_loss_mlp": 1.08614171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.06061063893945359, + "language_loss": 0.88073885, + "learning_rate": 0.0007154570489478081, + "loss": 0.89172894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1962, + "time_per_iteration": 3.1824018955230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111399, + "balance_loss_mlp": 1.10154414, + "diversity_loss_mlp": 0.0, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.06274200702745775, + "language_loss": 0.86391222, + "learning_rate": 0.0007151758735572514, + "loss": 0.87505209, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 1963, + "time_per_iteration": 2.997624158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_mlp": 1.09836888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.07983075782925624, + "language_loss": 0.80894458, + "learning_rate": 0.0007148946146280119, + "loss": 0.82005548, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 1964, + "time_per_iteration": 2.836583137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00620122, + "balance_loss_mlp": 1.05382681, + "diversity_loss_mlp": 0.16216688, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.0017779517528101797, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.72812271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01212509, + "step": 1965, + "time_per_iteration": 4.906678915023804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_mlp": 1.02436352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.025755206304302582, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.7637251, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 1966, + "time_per_iteration": 4.93319296836853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127949, + "balance_loss_mlp": 1.11581361, + "diversity_loss_mlp": 0.0, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.05898800907157556, + "language_loss": 0.83873129, + "learning_rate": 0.0007140503377003022, + "loss": 0.85001081, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 1967, + "time_per_iteration": 2.9807000160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123739, + "balance_loss_mlp": 1.11125755, + "diversity_loss_mlp": 0.0, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.06421364750503517, + "language_loss": 0.84625173, + "learning_rate": 0.000713768745708599, + "loss": 0.85748911, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1968, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_mlp": 1.10671234, + "diversity_loss_mlp": 0.0, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.06880095080762995, + "language_loss": 0.77052647, + "learning_rate": 0.0007134870707245085, + "loss": 0.78171611, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.12249756, + "routerloss_mlp": 0.0, + "step": 1969, + "time_per_iteration": 3.302985429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_mlp": 1.10852587, + "diversity_loss_mlp": 0.0, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.07142024228833302, + "language_loss": 0.84469545, + "learning_rate": 0.0007132053128573864, + "loss": 0.85590458, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.12384033, + "routerloss_mlp": 0.0, + "step": 1970, + "time_per_iteration": 2.7751197814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.11231327, + "diversity_loss_mlp": 0.0, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06795721743578591, + "language_loss": 0.83786452, + "learning_rate": 0.0007129234722166211, + "loss": 0.84910882, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 1971, + "time_per_iteration": 2.806898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114684, + "balance_loss_mlp": 1.10238707, + "diversity_loss_mlp": 0.0, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.06601167392952549, + "language_loss": 0.91087604, + "learning_rate": 0.0007126415489116328, + "loss": 0.92202282, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.1229248, + "routerloss_mlp": 0.0, + "step": 1972, + "time_per_iteration": 2.656651496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.09782279, + "diversity_loss_mlp": 0.0, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06641244535011205, + "language_loss": 0.81145501, + "learning_rate": 0.0007123595430518736, + "loss": 0.82255375, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 1973, + "time_per_iteration": 2.8665072917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_mlp": 1.09068835, + "diversity_loss_mlp": 0.0, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.07235703206146665, + "language_loss": 0.86411089, + "learning_rate": 0.0007120774547468282, + "loss": 0.87513655, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 1974, + "time_per_iteration": 2.5590381622314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948323, + "balance_loss_mlp": 1.65707994, + "diversity_loss_mlp": 0.20756721, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.03148003592885531, + "language_loss": 0.81558585, + "learning_rate": 0.0007117952841060128, + "loss": 0.82506907, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01599924, + "step": 1975, + "time_per_iteration": 2.6777563095092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_mlp": 1.07167053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.07660828670939425, + "language_loss": 0.83672053, + "learning_rate": 0.0007115130312389756, + "loss": 0.8475588, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 1976, + "time_per_iteration": 2.7103323936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07200503, + "diversity_loss_mlp": 0.0, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.08353002189035653, + "language_loss": 0.79290646, + "learning_rate": 0.0007112306962552973, + "loss": 0.80375111, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.12463379, + "routerloss_mlp": 0.0, + "step": 1977, + "time_per_iteration": 2.576239824295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_mlp": 1.07254314, + "diversity_loss_mlp": 0.0, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.06483406604645132, + "language_loss": 0.85315859, + "learning_rate": 0.0007109482792645896, + "loss": 0.86400628, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 1978, + "time_per_iteration": 2.7146143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.07276165, + "diversity_loss_mlp": 0.0, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.06865418790878511, + "language_loss": 0.83831733, + "learning_rate": 0.0007106657803764969, + "loss": 0.84916663, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 1979, + "time_per_iteration": 2.73152494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086146, + "balance_loss_mlp": 1.07395101, + "diversity_loss_mlp": 0.0, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07620298141647525, + "language_loss": 0.81962979, + "learning_rate": 0.0007103831997006948, + "loss": 0.83049119, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 1980, + "time_per_iteration": 2.7383615970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_mlp": 1.08276772, + "diversity_loss_mlp": 0.0, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.0842263164190672, + "language_loss": 0.85342598, + "learning_rate": 0.0007101005373468908, + "loss": 0.86437213, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 1981, + "time_per_iteration": 2.889251708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097444, + "balance_loss_mlp": 1.08543372, + "diversity_loss_mlp": 0.0, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.06048237516575629, + "language_loss": 0.86649287, + "learning_rate": 0.0007098177934248242, + "loss": 0.87746727, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 1982, + "time_per_iteration": 2.773146867752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920145, + "balance_loss_mlp": 1.60273147, + "diversity_loss_mlp": 0.20649332, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.033525346661278974, + "language_loss": 0.85516387, + "learning_rate": 0.0007095349680442661, + "loss": 0.86436534, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01553278, + "step": 1983, + "time_per_iteration": 2.8675785064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116298, + "balance_loss_mlp": 1.1045742, + "diversity_loss_mlp": 0.0, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.06407324010727367, + "language_loss": 0.78783178, + "learning_rate": 0.0007092520613150188, + "loss": 0.79899484, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 1984, + "time_per_iteration": 2.709177017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918651, + "balance_loss_mlp": 1.59999418, + "diversity_loss_mlp": 0.20665541, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.03070680845617011, + "language_loss": 0.80925471, + "learning_rate": 0.0007089690733469165, + "loss": 0.81844121, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532666, + "step": 1985, + "time_per_iteration": 2.750558376312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_mlp": 1.12384343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.08571071539105668, + "language_loss": 0.82313848, + "learning_rate": 0.000708686004249825, + "loss": 0.83449578, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 1986, + "time_per_iteration": 2.7550368309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.12012124, + "diversity_loss_mlp": 0.0, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07744479108461458, + "language_loss": 0.91340905, + "learning_rate": 0.0007084028541336413, + "loss": 0.92473006, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 1987, + "time_per_iteration": 2.703339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914957, + "balance_loss_mlp": 1.59260678, + "diversity_loss_mlp": 0.20690078, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.03035395776464378, + "language_loss": 0.86267084, + "learning_rate": 0.0007081196231082942, + "loss": 0.87182039, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01520337, + "step": 1988, + "time_per_iteration": 2.8075153827667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.10567343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.07746710731409655, + "language_loss": 0.80053389, + "learning_rate": 0.0007078363112837436, + "loss": 0.81171107, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.12036133, + "routerloss_mlp": 0.0, + "step": 1989, + "time_per_iteration": 2.811197280883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.09261441, + "diversity_loss_mlp": 0.0, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.07961201652041947, + "language_loss": 0.84721339, + "learning_rate": 0.000707552918769981, + "loss": 0.85826218, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 1990, + "time_per_iteration": 2.4908246994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_mlp": 1.08987188, + "diversity_loss_mlp": 0.0, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.06284554422997896, + "language_loss": 0.83619118, + "learning_rate": 0.000707269445677029, + "loss": 0.84721333, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.12341309, + "routerloss_mlp": 0.0, + "step": 1991, + "time_per_iteration": 2.733126401901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101588, + "balance_loss_mlp": 1.08921361, + "diversity_loss_mlp": 0.0, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07203164936975576, + "language_loss": 0.85140717, + "learning_rate": 0.0007069858921149416, + "loss": 0.86242306, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 1992, + "time_per_iteration": 2.9382007122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_mlp": 1.08434701, + "diversity_loss_mlp": 0.0, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.05485930037569587, + "language_loss": 0.85794246, + "learning_rate": 0.0007067022581938043, + "loss": 0.86891043, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.12457275, + "routerloss_mlp": 0.0, + "step": 1993, + "time_per_iteration": 2.857525110244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_mlp": 1.08321714, + "diversity_loss_mlp": 0.0, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.0871408980162776, + "language_loss": 0.83722532, + "learning_rate": 0.0007064185440237334, + "loss": 0.8481794, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1994, + "time_per_iteration": 2.7131123542785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099231, + "balance_loss_mlp": 1.08733368, + "diversity_loss_mlp": 0.0, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.06357294591464056, + "language_loss": 0.84358412, + "learning_rate": 0.0007061347497148764, + "loss": 0.85457647, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.11895752, + "routerloss_mlp": 0.0, + "step": 1995, + "time_per_iteration": 2.7398569583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_mlp": 1.09015, + "diversity_loss_mlp": 0.0, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.07322887134464046, + "language_loss": 0.86299884, + "learning_rate": 0.0007058508753774122, + "loss": 0.87402225, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1996, + "time_per_iteration": 2.6903162002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108867, + "balance_loss_mlp": 1.09709477, + "diversity_loss_mlp": 0.0, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.0698381422429368, + "language_loss": 0.86921895, + "learning_rate": 0.0007055669211215505, + "loss": 0.88030767, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 1997, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113141, + "balance_loss_mlp": 1.10084486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.08585182349688475, + "language_loss": 0.77776283, + "learning_rate": 0.0007052828870575322, + "loss": 0.78889418, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 1998, + "time_per_iteration": 2.685685873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_mlp": 1.11406291, + "diversity_loss_mlp": 0.0, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06979871165732322, + "language_loss": 0.87060714, + "learning_rate": 0.0007049987732956291, + "loss": 0.8818661, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.11834717, + "routerloss_mlp": 0.0, + "step": 1999, + "time_per_iteration": 2.9710631370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110647, + "balance_loss_mlp": 1.09428668, + "diversity_loss_mlp": 0.0, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05561177596637214, + "language_loss": 0.82812738, + "learning_rate": 0.0007047145799461439, + "loss": 0.83919203, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2000, + "time_per_iteration": 2.8492860794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_mlp": 1.09293747, + "diversity_loss_mlp": 0.0, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06017266002852966, + "language_loss": 0.82272708, + "learning_rate": 0.00070443030711941, + "loss": 0.83377922, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2001, + "time_per_iteration": 2.769383430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_mlp": 1.08806002, + "diversity_loss_mlp": 0.0, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.061888534691205976, + "language_loss": 0.82098496, + "learning_rate": 0.0007041459549257924, + "loss": 0.83198571, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2002, + "time_per_iteration": 2.876244306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089803, + "balance_loss_mlp": 1.07744145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.06816771124006925, + "language_loss": 0.78024125, + "learning_rate": 0.0007038615234756859, + "loss": 0.79113925, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.12359619, + "routerloss_mlp": 0.0, + "step": 2003, + "time_per_iteration": 3.1744768619537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_mlp": 1.07477546, + "diversity_loss_mlp": 0.0, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.09233530116269285, + "language_loss": 0.83808231, + "learning_rate": 0.000703577012879517, + "loss": 0.84895122, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2004, + "time_per_iteration": 2.633391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089004, + "balance_loss_mlp": 1.07705307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07105955558417659, + "language_loss": 0.88946962, + "learning_rate": 0.0007032924232477423, + "loss": 0.90035963, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2005, + "time_per_iteration": 2.6482574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109168, + "balance_loss_mlp": 1.0797528, + "diversity_loss_mlp": 0.0, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.07024694433071269, + "language_loss": 0.80605727, + "learning_rate": 0.0007030077546908493, + "loss": 0.81697416, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2006, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.08056581, + "diversity_loss_mlp": 0.0, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.032453276732354666, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84151709, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.06494141, + "routerloss_mlp": 0.0, + "step": 2007, + "time_per_iteration": 4.798014402389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099136, + "balance_loss_mlp": 1.08744717, + "diversity_loss_mlp": 0.0, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.08661380313869275, + "language_loss": 0.79137146, + "learning_rate": 0.0007024381812438117, + "loss": 0.8023628, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2008, + "time_per_iteration": 2.5403189659118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_mlp": 1.08864713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09407170185597404, + "language_loss": 0.83448064, + "learning_rate": 0.0007021532765747951, + "loss": 0.8454901, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2009, + "time_per_iteration": 2.9585187435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094144, + "balance_loss_mlp": 1.08211613, + "diversity_loss_mlp": 0.0, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.0684890586406507, + "language_loss": 0.79048979, + "learning_rate": 0.0007018682934229162, + "loss": 0.80143124, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2010, + "time_per_iteration": 2.9703307151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_mlp": 1.0842756, + "diversity_loss_mlp": 0.0, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06303649013837292, + "language_loss": 0.82761061, + "learning_rate": 0.0007015832318988152, + "loss": 0.83857542, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2011, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_mlp": 1.02231336, + "diversity_loss_mlp": 0.0, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.017766506591404385, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.7491802, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 2012, + "time_per_iteration": 4.938155651092529 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_mlp": 1.07810068, + "diversity_loss_mlp": 0.0, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.060967443696148906, + "language_loss": 0.84265292, + "learning_rate": 0.0007010128741766604, + "loss": 0.85356176, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 2013, + "time_per_iteration": 2.7293431758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091499, + "balance_loss_mlp": 1.07861209, + "diversity_loss_mlp": 0.0, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.07873148114105366, + "language_loss": 0.84277219, + "learning_rate": 0.0007007275782000391, + "loss": 0.85368717, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 2014, + "time_per_iteration": 2.644911766052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091625, + "balance_loss_mlp": 1.07889354, + "diversity_loss_mlp": 0.0, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.0868083489465314, + "language_loss": 0.8502394, + "learning_rate": 0.0007004422042940605, + "loss": 0.86115563, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2015, + "time_per_iteration": 2.5096747875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109894, + "balance_loss_mlp": 1.08593392, + "diversity_loss_mlp": 0.0, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08227522563153689, + "language_loss": 0.89877218, + "learning_rate": 0.0007001567525695169, + "loss": 0.90976155, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 2016, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06437704205290017, + "language_loss": 0.83705699, + "learning_rate": 0.0006998712231372303, + "loss": 0.84811676, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2017, + "time_per_iteration": 3.016061305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119415, + "balance_loss_mlp": 1.10692167, + "diversity_loss_mlp": 0.0, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06622760195410109, + "language_loss": 0.85886908, + "learning_rate": 0.0006995856161080532, + "loss": 0.87006325, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.12487793, + "routerloss_mlp": 0.0, + "step": 2018, + "time_per_iteration": 2.8263893127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_mlp": 1.11165869, + "diversity_loss_mlp": 0.0, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.06957079313074316, + "language_loss": 0.82328916, + "learning_rate": 0.0006992999315928679, + "loss": 0.83453172, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.1260376, + "routerloss_mlp": 0.0, + "step": 2019, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.11772799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.05589846380959986, + "language_loss": 0.85480869, + "learning_rate": 0.0006990141697025871, + "loss": 0.86611497, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 2020, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067569, + "balance_loss_mlp": 1.06141829, + "diversity_loss_mlp": 0.0, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.034323999481440985, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77427208, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2021, + "time_per_iteration": 4.782108545303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130924, + "balance_loss_mlp": 1.11879468, + "diversity_loss_mlp": 0.0, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0813348018947899, + "language_loss": 0.82333553, + "learning_rate": 0.0006984424142405392, + "loss": 0.83464473, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2022, + "time_per_iteration": 2.804866075515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.10578668, + "diversity_loss_mlp": 0.0, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.07379903296161248, + "language_loss": 0.82117045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83235097, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2023, + "time_per_iteration": 2.5883662700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_mlp": 1.11855519, + "diversity_loss_mlp": 0.0, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.07869766022149485, + "language_loss": 0.8995713, + "learning_rate": 0.0006978703506098102, + "loss": 0.91087961, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2024, + "time_per_iteration": 2.730283498764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127449, + "balance_loss_mlp": 1.11556411, + "diversity_loss_mlp": 0.0, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.0665173530375796, + "language_loss": 0.88210815, + "learning_rate": 0.00069758420350879, + "loss": 0.89338267, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2025, + "time_per_iteration": 2.62969708442688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932402, + "balance_loss_mlp": 1.62686133, + "diversity_loss_mlp": 0.20693868, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.03379762859523427, + "language_loss": 0.8613863, + "learning_rate": 0.000697297979698779, + "loss": 0.87071025, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01550185, + "step": 2026, + "time_per_iteration": 2.837543249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_mlp": 1.09529877, + "diversity_loss_mlp": 0.0, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06049708379655892, + "language_loss": 0.83660531, + "learning_rate": 0.0006970116792908992, + "loss": 0.84767604, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2027, + "time_per_iteration": 3.1133604049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_mlp": 1.0960542, + "diversity_loss_mlp": 0.0, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.07190738956644391, + "language_loss": 0.81380564, + "learning_rate": 0.000696725302396302, + "loss": 0.82488191, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2028, + "time_per_iteration": 2.6460230350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109766, + "balance_loss_mlp": 1.08604932, + "diversity_loss_mlp": 0.0, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.06814290150602269, + "language_loss": 0.85887402, + "learning_rate": 0.0006964388491261692, + "loss": 0.86985064, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2029, + "time_per_iteration": 3.296208143234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099195, + "balance_loss_mlp": 1.0871129, + "diversity_loss_mlp": 0.0, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.075812953715104, + "language_loss": 0.87511015, + "learning_rate": 0.0006961523195917114, + "loss": 0.88610214, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2030, + "time_per_iteration": 2.803239345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_mlp": 1.09573865, + "diversity_loss_mlp": 0.0, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.0665807006884719, + "language_loss": 0.78137511, + "learning_rate": 0.0006958657139041696, + "loss": 0.79245031, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2031, + "time_per_iteration": 2.739151954650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_mlp": 1.05531955, + "diversity_loss_mlp": 0.0, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.035996309550900246, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77773988, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.05688477, + "routerloss_mlp": 0.0, + "step": 2032, + "time_per_iteration": 4.918209552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_mlp": 1.08307993, + "diversity_loss_mlp": 0.0, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.0751880944680772, + "language_loss": 0.78643966, + "learning_rate": 0.0006952922745149434, + "loss": 0.79738843, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2033, + "time_per_iteration": 2.6274161338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091358, + "balance_loss_mlp": 1.07940745, + "diversity_loss_mlp": 0.0, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.07391479182011068, + "language_loss": 0.87674987, + "learning_rate": 0.000695005441035888, + "loss": 0.88766348, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2034, + "time_per_iteration": 2.647348642349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018577, + "balance_loss_mlp": 1.01280713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.010435626825017296, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74742007, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2035, + "time_per_iteration": 4.8861188888549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107172, + "balance_loss_mlp": 1.094733, + "diversity_loss_mlp": 0.0, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06114898183694146, + "language_loss": 0.81133932, + "learning_rate": 0.0006944315470656863, + "loss": 0.82241106, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.12438965, + "routerloss_mlp": 0.0, + "step": 2036, + "time_per_iteration": 3.0057246685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_mlp": 1.09606266, + "diversity_loss_mlp": 0.0, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.0812142536963638, + "language_loss": 0.90953541, + "learning_rate": 0.000694144486797345, + "loss": 0.92062169, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2037, + "time_per_iteration": 2.6566872596740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_mlp": 1.0060699, + "diversity_loss_mlp": 0.0, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.012879447335335118, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80532491, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2038, + "time_per_iteration": 4.609802722930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.09141517, + "diversity_loss_mlp": 0.0, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.07718413790316761, + "language_loss": 0.89271998, + "learning_rate": 0.0006935701402514156, + "loss": 0.90375727, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2039, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101805, + "balance_loss_mlp": 1.01206541, + "diversity_loss_mlp": 0.0, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.016017309503016164, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74052942, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2040, + "time_per_iteration": 4.954579830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_mlp": 1.09434199, + "diversity_loss_mlp": 0.0, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.0728619475730698, + "language_loss": 0.84539711, + "learning_rate": 0.0006929954931031422, + "loss": 0.85646391, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2041, + "time_per_iteration": 3.6979990005493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114297, + "balance_loss_mlp": 1.10201287, + "diversity_loss_mlp": 0.0, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.07303574322286652, + "language_loss": 0.88330269, + "learning_rate": 0.0006927080570819805, + "loss": 0.89444566, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2042, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_mlp": 1.11437607, + "diversity_loss_mlp": 0.0, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09784101638347129, + "language_loss": 0.80726093, + "learning_rate": 0.0006924205462449161, + "loss": 0.81852722, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2043, + "time_per_iteration": 2.556964159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123407, + "balance_loss_mlp": 1.11139631, + "diversity_loss_mlp": 0.0, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07674510212981295, + "language_loss": 0.81822228, + "learning_rate": 0.0006921329607035702, + "loss": 0.82945639, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.12005615, + "routerloss_mlp": 0.0, + "step": 2044, + "time_per_iteration": 3.2355051040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_mlp": 1.09777582, + "diversity_loss_mlp": 0.0, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0626655505852987, + "language_loss": 0.87889385, + "learning_rate": 0.0006918453005695938, + "loss": 0.88998848, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2045, + "time_per_iteration": 2.616405725479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112555, + "balance_loss_mlp": 1.10047281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0593607382511463, + "language_loss": 0.8430419, + "learning_rate": 0.0006915575659546662, + "loss": 0.85416746, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2046, + "time_per_iteration": 2.6596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_mlp": 1.08785915, + "diversity_loss_mlp": 0.0, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0680979304239865, + "language_loss": 0.80745959, + "learning_rate": 0.0006912697569704959, + "loss": 0.81846058, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2047, + "time_per_iteration": 2.5962154865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097855, + "balance_loss_mlp": 1.08564174, + "diversity_loss_mlp": 0.0, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07634449995136075, + "language_loss": 0.8702817, + "learning_rate": 0.0006909818737288205, + "loss": 0.88126016, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.12207031, + "routerloss_mlp": 0.0, + "step": 2048, + "time_per_iteration": 2.5559332370758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111282, + "balance_loss_mlp": 1.09955215, + "diversity_loss_mlp": 0.0, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07451514550279957, + "language_loss": 0.80715293, + "learning_rate": 0.000690693916341406, + "loss": 0.81826574, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2049, + "time_per_iteration": 2.605881690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_mlp": 1.10377121, + "diversity_loss_mlp": 0.0, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.06516266173427393, + "language_loss": 0.82286257, + "learning_rate": 0.0006904058849200475, + "loss": 0.83401763, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2050, + "time_per_iteration": 2.7183115482330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.09360313, + "diversity_loss_mlp": 0.0, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.0753850450331705, + "language_loss": 0.84972727, + "learning_rate": 0.0006901177795765683, + "loss": 0.8607837, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2051, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105318, + "balance_loss_mlp": 1.09354019, + "diversity_loss_mlp": 0.0, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.06465732667856934, + "language_loss": 0.81096435, + "learning_rate": 0.0006898296004228213, + "loss": 0.82201755, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2052, + "time_per_iteration": 2.7607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050397, + "balance_loss_mlp": 1.04446077, + "diversity_loss_mlp": 0.0, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03031396698302257, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177135, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.05932617, + "routerloss_mlp": 0.0, + "step": 2053, + "time_per_iteration": 4.876460552215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_mlp": 1.10529494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.1105412420488248, + "language_loss": 0.79620701, + "learning_rate": 0.0006892530211320763, + "loss": 0.80737776, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2054, + "time_per_iteration": 2.702591896057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00944261, + "balance_loss_mlp": 1.6481061, + "diversity_loss_mlp": 0.21043469, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.03587460904718008, + "language_loss": 0.84313488, + "learning_rate": 0.000688964621218926, + "loss": 0.85257751, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01499031, + "step": 2055, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109262, + "balance_loss_mlp": 1.08063984, + "diversity_loss_mlp": 0.0, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.0862390851468888, + "language_loss": 0.80478442, + "learning_rate": 0.0006886761479432037, + "loss": 0.81571066, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2056, + "time_per_iteration": 2.8577234745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_mlp": 1.06739902, + "diversity_loss_mlp": 0.0, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.06874544900142358, + "language_loss": 0.84387571, + "learning_rate": 0.0006883876014169045, + "loss": 0.85467529, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2057, + "time_per_iteration": 2.572458505630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073975, + "balance_loss_mlp": 1.06154716, + "diversity_loss_mlp": 0.0, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07681071569739906, + "language_loss": 0.90056652, + "learning_rate": 0.000688098981752052, + "loss": 0.91130626, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 2058, + "time_per_iteration": 2.7125563621520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080003, + "balance_loss_mlp": 1.06697917, + "diversity_loss_mlp": 0.0, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08571295812058347, + "language_loss": 0.80176479, + "learning_rate": 0.0006878102890606982, + "loss": 0.81256485, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 2059, + "time_per_iteration": 3.0797197818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108467, + "balance_loss_mlp": 1.07161617, + "diversity_loss_mlp": 0.0, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08415103615204221, + "language_loss": 0.81576395, + "learning_rate": 0.0006875215234549239, + "loss": 0.82661068, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 2060, + "time_per_iteration": 2.5358171463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_mlp": 1.06604218, + "diversity_loss_mlp": 0.0, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08360675720274492, + "language_loss": 0.85212821, + "learning_rate": 0.0006872326850468376, + "loss": 0.86291778, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 2061, + "time_per_iteration": 2.685746669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_mlp": 1.06612396, + "diversity_loss_mlp": 0.0, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08669948408116639, + "language_loss": 0.78834969, + "learning_rate": 0.0006869437739485762, + "loss": 0.79913992, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.12908936, + "routerloss_mlp": 0.0, + "step": 2062, + "time_per_iteration": 2.608938455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085084, + "balance_loss_mlp": 1.07266808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06314890183319057, + "language_loss": 0.92750764, + "learning_rate": 0.0006866547902723053, + "loss": 0.93835843, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.12420654, + "routerloss_mlp": 0.0, + "step": 2063, + "time_per_iteration": 2.654764175415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07135844, + "diversity_loss_mlp": 0.0, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10797740353372913, + "language_loss": 0.80444092, + "learning_rate": 0.000686365734130218, + "loss": 0.81527805, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2064, + "time_per_iteration": 2.7161076068878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_mlp": 1.07345843, + "diversity_loss_mlp": 0.0, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06605501724079509, + "language_loss": 0.83883071, + "learning_rate": 0.000686076605634536, + "loss": 0.84968603, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2065, + "time_per_iteration": 2.5960052013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088286, + "balance_loss_mlp": 1.07656133, + "diversity_loss_mlp": 0.0, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.06893141882644385, + "language_loss": 0.84303313, + "learning_rate": 0.0006857874048975088, + "loss": 0.85391599, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2066, + "time_per_iteration": 2.5419557094573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.08599246, + "diversity_loss_mlp": 0.0, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.07076940729430262, + "language_loss": 0.86944497, + "learning_rate": 0.0006854981320314142, + "loss": 0.88042831, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2067, + "time_per_iteration": 2.4425127506256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101956, + "balance_loss_mlp": 1.0900414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08678893766230582, + "language_loss": 0.86775517, + "learning_rate": 0.0006852087871485579, + "loss": 0.87877476, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2068, + "time_per_iteration": 2.617234468460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_mlp": 1.09308147, + "diversity_loss_mlp": 0.0, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08540761893483814, + "language_loss": 0.81805646, + "learning_rate": 0.0006849193703612735, + "loss": 0.82910275, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2069, + "time_per_iteration": 2.7818312644958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.08808875, + "diversity_loss_mlp": 0.0, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.06305964525737012, + "language_loss": 0.77731991, + "learning_rate": 0.0006846298817819225, + "loss": 0.78832221, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2070, + "time_per_iteration": 2.970045328140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_mlp": 1.08777106, + "diversity_loss_mlp": 0.0, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.09229213766989015, + "language_loss": 0.81058359, + "learning_rate": 0.0006843403215228945, + "loss": 0.82158017, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2071, + "time_per_iteration": 2.47542405128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.08525538, + "diversity_loss_mlp": 0.0, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.06250612449775428, + "language_loss": 0.80665851, + "learning_rate": 0.0006840506896966065, + "loss": 0.81763273, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2072, + "time_per_iteration": 2.7048730850219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_mlp": 1.09000397, + "diversity_loss_mlp": 0.0, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07670911788950584, + "language_loss": 0.82343054, + "learning_rate": 0.0006837609864155038, + "loss": 0.83445203, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2073, + "time_per_iteration": 2.940208673477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_mlp": 1.09976768, + "diversity_loss_mlp": 0.0, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.06443735331096001, + "language_loss": 0.83203363, + "learning_rate": 0.0006834712117920592, + "loss": 0.84314907, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2074, + "time_per_iteration": 2.6217153072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_mlp": 1.09892166, + "diversity_loss_mlp": 0.0, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07401760730887977, + "language_loss": 0.85670066, + "learning_rate": 0.0006831813659387729, + "loss": 0.86781245, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2075, + "time_per_iteration": 2.5696237087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109855, + "balance_loss_mlp": 1.09774292, + "diversity_loss_mlp": 0.0, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.05990934262108594, + "language_loss": 0.84167391, + "learning_rate": 0.0006828914489681733, + "loss": 0.85277247, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2076, + "time_per_iteration": 2.7859339714050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119025, + "balance_loss_mlp": 1.1072948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.06517456650976074, + "language_loss": 0.85312855, + "learning_rate": 0.0006826014609928162, + "loss": 0.86431879, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2077, + "time_per_iteration": 2.6851699352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0060157, + "balance_loss_mlp": 1.02597332, + "diversity_loss_mlp": 0.1552759, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.0013651319096223075, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8380096, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094547, + "step": 2078, + "time_per_iteration": 4.859188795089722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.1030947, + "diversity_loss_mlp": 0.0, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.0748648316539235, + "language_loss": 0.80062771, + "learning_rate": 0.0006820212724781896, + "loss": 0.81177354, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.11486816, + "routerloss_mlp": 0.0, + "step": 2079, + "time_per_iteration": 2.6628189086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.09492946, + "diversity_loss_mlp": 0.0, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06148312623903997, + "language_loss": 0.83733618, + "learning_rate": 0.0006817310721641694, + "loss": 0.84840119, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2080, + "time_per_iteration": 2.847182512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119227, + "balance_loss_mlp": 1.10731816, + "diversity_loss_mlp": 0.0, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.07223167054032475, + "language_loss": 0.83566946, + "learning_rate": 0.00068144080129589, + "loss": 0.84686172, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2081, + "time_per_iteration": 2.7161402702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.10388541, + "diversity_loss_mlp": 0.0, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.07619573858560975, + "language_loss": 0.8280167, + "learning_rate": 0.0006811504599860441, + "loss": 0.83917284, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2082, + "time_per_iteration": 2.5584774017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104984, + "balance_loss_mlp": 1.0928719, + "diversity_loss_mlp": 0.0, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.1306421138400452, + "language_loss": 0.8569895, + "learning_rate": 0.0006808600483473526, + "loss": 0.86803931, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2083, + "time_per_iteration": 2.864786148071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094797, + "balance_loss_mlp": 1.0824883, + "diversity_loss_mlp": 0.0, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.06339794743033755, + "language_loss": 0.86393988, + "learning_rate": 0.0006805695664925629, + "loss": 0.87488782, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2084, + "time_per_iteration": 2.844709634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089155, + "balance_loss_mlp": 1.07735372, + "diversity_loss_mlp": 0.0, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.0888076684038974, + "language_loss": 0.83841193, + "learning_rate": 0.0006802790145344506, + "loss": 0.84930348, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2085, + "time_per_iteration": 2.4883856773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083386, + "balance_loss_mlp": 1.07145894, + "diversity_loss_mlp": 0.0, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07803386161895243, + "language_loss": 0.87420845, + "learning_rate": 0.0006799883925858176, + "loss": 0.88504231, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2086, + "time_per_iteration": 2.8824286460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088787, + "balance_loss_mlp": 1.0766871, + "diversity_loss_mlp": 0.0, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06924310288687491, + "language_loss": 0.85459089, + "learning_rate": 0.0006796977007594933, + "loss": 0.86547881, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2087, + "time_per_iteration": 2.6597371101379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970559, + "balance_loss_mlp": 1.6983223, + "diversity_loss_mlp": 0.21244028, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.03280700890509502, + "language_loss": 0.86715519, + "learning_rate": 0.0006794069391683345, + "loss": 0.87686074, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01517779, + "step": 2088, + "time_per_iteration": 2.7649624347686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078316, + "balance_loss_mlp": 1.06610286, + "diversity_loss_mlp": 0.0, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07764554073270104, + "language_loss": 0.80781567, + "learning_rate": 0.0006791161079252248, + "loss": 0.81859887, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2089, + "time_per_iteration": 2.6467885971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082706, + "balance_loss_mlp": 1.07014716, + "diversity_loss_mlp": 0.0, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.0935978018434956, + "language_loss": 0.82482743, + "learning_rate": 0.0006788252071430747, + "loss": 0.8356545, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.12561035, + "routerloss_mlp": 0.0, + "step": 2090, + "time_per_iteration": 2.684659242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076621, + "balance_loss_mlp": 1.06417561, + "diversity_loss_mlp": 0.0, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.061003649340911806, + "language_loss": 0.86884034, + "learning_rate": 0.0006785342369348222, + "loss": 0.87960654, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 2091, + "time_per_iteration": 2.7500762939453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081239, + "balance_loss_mlp": 1.06896663, + "diversity_loss_mlp": 0.0, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08323404973511926, + "language_loss": 0.79681003, + "learning_rate": 0.0006782431974134316, + "loss": 0.80762231, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2092, + "time_per_iteration": 2.554500102996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.07266974, + "diversity_loss_mlp": 0.0, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.06323665884579813, + "language_loss": 0.89339125, + "learning_rate": 0.0006779520886918949, + "loss": 0.90424317, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2093, + "time_per_iteration": 3.0625791549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109256, + "balance_loss_mlp": 1.08038247, + "diversity_loss_mlp": 0.0, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.06591278584355922, + "language_loss": 0.81594688, + "learning_rate": 0.0006776609108832301, + "loss": 0.82687247, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2094, + "time_per_iteration": 2.84006929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099273, + "balance_loss_mlp": 1.08723903, + "diversity_loss_mlp": 0.0, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.07397134749055344, + "language_loss": 0.84911013, + "learning_rate": 0.0006773696641004828, + "loss": 0.86010277, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2095, + "time_per_iteration": 2.5662059783935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110289, + "balance_loss_mlp": 1.09781969, + "diversity_loss_mlp": 0.0, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.07471072764212172, + "language_loss": 0.77422667, + "learning_rate": 0.0006770783484567247, + "loss": 0.78532958, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2096, + "time_per_iteration": 3.120000123977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106409, + "balance_loss_mlp": 1.09445786, + "diversity_loss_mlp": 0.0, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.05645154934481913, + "language_loss": 0.85885596, + "learning_rate": 0.000676786964065055, + "loss": 0.86992002, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2097, + "time_per_iteration": 2.7947449684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09767413, + "diversity_loss_mlp": 0.0, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.06468702094514471, + "language_loss": 0.78823644, + "learning_rate": 0.0006764955110385986, + "loss": 0.7993331, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2098, + "time_per_iteration": 2.7805027961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113539, + "balance_loss_mlp": 1.10162365, + "diversity_loss_mlp": 0.0, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06520165677387538, + "language_loss": 0.80479109, + "learning_rate": 0.0006762039894905083, + "loss": 0.81592649, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2099, + "time_per_iteration": 2.5934462547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113864, + "balance_loss_mlp": 1.10191941, + "diversity_loss_mlp": 0.0, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.07619139256642768, + "language_loss": 0.80502266, + "learning_rate": 0.000675912399533962, + "loss": 0.81616127, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2100, + "time_per_iteration": 2.5193917751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0095878, + "balance_loss_mlp": 1.67460704, + "diversity_loss_mlp": 0.21229821, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.026749352452392162, + "language_loss": 0.8501215, + "learning_rate": 0.0006756207412821656, + "loss": 0.85970926, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532745, + "step": 2101, + "time_per_iteration": 3.0674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125351, + "balance_loss_mlp": 1.11366224, + "diversity_loss_mlp": 0.0, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.07971707112625441, + "language_loss": 0.80680853, + "learning_rate": 0.0006753290148483505, + "loss": 0.81806201, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2102, + "time_per_iteration": 3.0177412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128591, + "balance_loss_mlp": 1.11720061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07197972569419236, + "language_loss": 0.78862077, + "learning_rate": 0.0006750372203457752, + "loss": 0.79990667, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2103, + "time_per_iteration": 2.4715232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133472, + "balance_loss_mlp": 1.12199795, + "diversity_loss_mlp": 0.0, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.05679089538273026, + "language_loss": 0.8629868, + "learning_rate": 0.0006747453578877242, + "loss": 0.87432158, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2104, + "time_per_iteration": 2.7127907276153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133404, + "balance_loss_mlp": 1.12154305, + "diversity_loss_mlp": 0.0, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.07881786572134404, + "language_loss": 0.83325595, + "learning_rate": 0.0006744534275875085, + "loss": 0.84459001, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2105, + "time_per_iteration": 2.9968934059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_mlp": 1.11278331, + "diversity_loss_mlp": 0.0, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.06959652480101333, + "language_loss": 0.85228348, + "learning_rate": 0.0006741614295584657, + "loss": 0.86352497, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2106, + "time_per_iteration": 2.6837310791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128388, + "balance_loss_mlp": 1.1166873, + "diversity_loss_mlp": 0.0, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.07271017039443997, + "language_loss": 0.78820735, + "learning_rate": 0.0006738693639139595, + "loss": 0.79949123, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2107, + "time_per_iteration": 2.9876344203948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.09982085, + "diversity_loss_mlp": 0.0, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.07545270814647756, + "language_loss": 0.7770499, + "learning_rate": 0.0006735772307673796, + "loss": 0.78816462, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2108, + "time_per_iteration": 3.5391368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.1007216, + "diversity_loss_mlp": 0.0, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.07028810729839409, + "language_loss": 0.8317976, + "learning_rate": 0.0006732850302321421, + "loss": 0.84292281, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2109, + "time_per_iteration": 2.924703359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107897, + "balance_loss_mlp": 1.0962801, + "diversity_loss_mlp": 0.0, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.08331494403878895, + "language_loss": 0.84220135, + "learning_rate": 0.00067299276242169, + "loss": 0.85328031, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2110, + "time_per_iteration": 2.6628758907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00591895, + "balance_loss_mlp": 1.01285744, + "diversity_loss_mlp": 0.15005666, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.0011574932258311419, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.74974066, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01043818, + "step": 2111, + "time_per_iteration": 4.913798093795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100112, + "balance_loss_mlp": 1.0884769, + "diversity_loss_mlp": 0.0, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.0671840972805921, + "language_loss": 0.77974957, + "learning_rate": 0.0006724080254290395, + "loss": 0.79075068, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2112, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087298, + "balance_loss_mlp": 1.07509685, + "diversity_loss_mlp": 0.0, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.06921545909042545, + "language_loss": 0.89956391, + "learning_rate": 0.0006721155564738566, + "loss": 0.91043687, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2113, + "time_per_iteration": 2.654052495956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590146, + "balance_loss_mlp": 1.01069736, + "diversity_loss_mlp": 0.14874323, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.001129022163549877, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79212785, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01042587, + "step": 2114, + "time_per_iteration": 5.02890682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095601, + "balance_loss_mlp": 1.08348942, + "diversity_loss_mlp": 0.0, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.06673632265299649, + "language_loss": 0.85678279, + "learning_rate": 0.0006715304182135078, + "loss": 0.86773884, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2115, + "time_per_iteration": 2.6665151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.07951176, + "diversity_loss_mlp": 0.0, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.08902530655488881, + "language_loss": 0.8859638, + "learning_rate": 0.0006712377491355127, + "loss": 0.89688623, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 2116, + "time_per_iteration": 2.9124083518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091485, + "balance_loss_mlp": 1.07896256, + "diversity_loss_mlp": 0.0, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.06275972542298792, + "language_loss": 0.81009984, + "learning_rate": 0.0006709450135771274, + "loss": 0.8210147, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2117, + "time_per_iteration": 2.9538469314575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109045, + "balance_loss_mlp": 1.07800436, + "diversity_loss_mlp": 0.0, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.06731197780562713, + "language_loss": 0.8655895, + "learning_rate": 0.0006706522116520023, + "loss": 0.87649393, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2118, + "time_per_iteration": 2.6403684616088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.08127189, + "diversity_loss_mlp": 0.0, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.07339707473672348, + "language_loss": 0.82936597, + "learning_rate": 0.0006703593434738127, + "loss": 0.84030455, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2119, + "time_per_iteration": 2.706406354904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_mlp": 1.0847466, + "diversity_loss_mlp": 0.0, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.05750096894007485, + "language_loss": 0.78123623, + "learning_rate": 0.0006700664091562604, + "loss": 0.79220533, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2120, + "time_per_iteration": 2.5515992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102687, + "balance_loss_mlp": 1.09045601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.08484846499370094, + "language_loss": 0.85241771, + "learning_rate": 0.0006697734088130725, + "loss": 0.86344457, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2121, + "time_per_iteration": 2.5997116565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094995, + "balance_loss_mlp": 1.08268619, + "diversity_loss_mlp": 0.0, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.06901349076849703, + "language_loss": 0.85628182, + "learning_rate": 0.0006694803425580018, + "loss": 0.86723173, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2122, + "time_per_iteration": 2.975572109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090381, + "balance_loss_mlp": 1.07825708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08123936309079019, + "language_loss": 0.84420574, + "learning_rate": 0.0006691872105048268, + "loss": 0.85510951, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2123, + "time_per_iteration": 2.5785253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109138, + "balance_loss_mlp": 1.07879114, + "diversity_loss_mlp": 0.0, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.06700388653835253, + "language_loss": 0.84703517, + "learning_rate": 0.0006688940127673513, + "loss": 0.85794896, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 2124, + "time_per_iteration": 2.794312000274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080411, + "balance_loss_mlp": 1.06789398, + "diversity_loss_mlp": 0.0, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.11477925500015464, + "language_loss": 0.85646629, + "learning_rate": 0.0006686007494594049, + "loss": 0.86727041, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2125, + "time_per_iteration": 2.8629977703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.06869102, + "diversity_loss_mlp": 0.0, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.08770785423003769, + "language_loss": 0.80226219, + "learning_rate": 0.0006683074206948425, + "loss": 0.81306815, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2126, + "time_per_iteration": 2.5477960109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.06884146, + "diversity_loss_mlp": 0.0, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.0688791895715759, + "language_loss": 0.81257784, + "learning_rate": 0.0006680140265875443, + "loss": 0.82338405, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2127, + "time_per_iteration": 2.824706792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.06504989, + "diversity_loss_mlp": 0.0, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.0706270365820259, + "language_loss": 0.95744675, + "learning_rate": 0.0006677205672514162, + "loss": 0.96821618, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2128, + "time_per_iteration": 2.6173171997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081253, + "balance_loss_mlp": 1.06944525, + "diversity_loss_mlp": 0.0, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.08385407721227026, + "language_loss": 0.88751161, + "learning_rate": 0.000667427042800389, + "loss": 0.89832413, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2129, + "time_per_iteration": 2.746561288833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090066, + "balance_loss_mlp": 1.07828188, + "diversity_loss_mlp": 0.0, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.0802302808929841, + "language_loss": 0.82728851, + "learning_rate": 0.0006671334533484192, + "loss": 0.83818918, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2130, + "time_per_iteration": 2.7765390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094937, + "balance_loss_mlp": 1.08306408, + "diversity_loss_mlp": 0.0, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.06494454218377498, + "language_loss": 0.83394802, + "learning_rate": 0.0006668397990094881, + "loss": 0.84489739, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2131, + "time_per_iteration": 2.6814444065093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094306, + "balance_loss_mlp": 1.08240891, + "diversity_loss_mlp": 0.0, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08851492372685672, + "language_loss": 0.84863144, + "learning_rate": 0.0006665460798976027, + "loss": 0.8595745, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2132, + "time_per_iteration": 2.734208822250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.08680749, + "diversity_loss_mlp": 0.0, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.07834997970618658, + "language_loss": 0.8153789, + "learning_rate": 0.0006662522961267947, + "loss": 0.82635975, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2133, + "time_per_iteration": 2.642789363861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_mlp": 1.0889008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.06175420460070233, + "language_loss": 0.87238759, + "learning_rate": 0.0006659584478111211, + "loss": 0.88339174, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2134, + "time_per_iteration": 2.8097283840179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110764, + "balance_loss_mlp": 1.09618366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.07261990262121029, + "language_loss": 0.82762325, + "learning_rate": 0.000665664535064664, + "loss": 0.83869964, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2135, + "time_per_iteration": 3.034973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118337, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07277612177905571, + "language_loss": 0.82753229, + "learning_rate": 0.0006653705580015303, + "loss": 0.83871567, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2136, + "time_per_iteration": 2.719024181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130623, + "balance_loss_mlp": 1.11913705, + "diversity_loss_mlp": 0.0, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.09561286081072368, + "language_loss": 0.86333638, + "learning_rate": 0.0006650765167358523, + "loss": 0.87464261, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2137, + "time_per_iteration": 2.798013210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119744, + "balance_loss_mlp": 1.10816908, + "diversity_loss_mlp": 0.0, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06575385598885217, + "language_loss": 0.90120316, + "learning_rate": 0.0006647824113817864, + "loss": 0.9124006, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2138, + "time_per_iteration": 2.5290029048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00941862, + "balance_loss_mlp": 1.64172852, + "diversity_loss_mlp": 0.21382158, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.027199696320483784, + "language_loss": 0.81782889, + "learning_rate": 0.000664488242053515, + "loss": 0.8272475, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01408678, + "step": 2139, + "time_per_iteration": 2.7610864639282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_mlp": 1.1009748, + "diversity_loss_mlp": 0.0, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.07795493316399416, + "language_loss": 0.83879304, + "learning_rate": 0.0006641940088652445, + "loss": 0.84992164, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2140, + "time_per_iteration": 2.7797446250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.08682573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09321248474614077, + "language_loss": 0.82214057, + "learning_rate": 0.0006638997119312065, + "loss": 0.83312857, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2141, + "time_per_iteration": 2.688427209854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_mlp": 1.07580638, + "diversity_loss_mlp": 0.0, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.05051376163622262, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76146024, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2142, + "time_per_iteration": 4.916438817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_mlp": 1.07186329, + "diversity_loss_mlp": 0.0, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.0666522569579182, + "language_loss": 0.84487629, + "learning_rate": 0.000663310927282877, + "loss": 0.85571963, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2143, + "time_per_iteration": 2.742781162261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.06302905, + "diversity_loss_mlp": 0.0, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07553146792883669, + "language_loss": 0.85816187, + "learning_rate": 0.000663016439797172, + "loss": 0.86891896, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2144, + "time_per_iteration": 2.602322578430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.06363273, + "diversity_loss_mlp": 0.0, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.09188682549299809, + "language_loss": 0.80924189, + "learning_rate": 0.0006627218890228724, + "loss": 0.82000041, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2145, + "time_per_iteration": 2.76790452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081048, + "balance_loss_mlp": 1.0687809, + "diversity_loss_mlp": 0.0, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.09235653357512275, + "language_loss": 0.83860421, + "learning_rate": 0.0006624272750743326, + "loss": 0.84941471, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2146, + "time_per_iteration": 2.986267566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085622, + "balance_loss_mlp": 1.073385, + "diversity_loss_mlp": 0.0, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06221373460159241, + "language_loss": 0.82866907, + "learning_rate": 0.0006621325980659322, + "loss": 0.83952528, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2147, + "time_per_iteration": 2.78074049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091906, + "balance_loss_mlp": 1.07981253, + "diversity_loss_mlp": 0.0, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06655163113776748, + "language_loss": 0.81613219, + "learning_rate": 0.000661837858112075, + "loss": 0.82705128, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2148, + "time_per_iteration": 2.8118457794189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920817, + "balance_loss_mlp": 1.59947157, + "diversity_loss_mlp": 0.21162269, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.03430222900415099, + "language_loss": 0.88696158, + "learning_rate": 0.0006615430553271888, + "loss": 0.89616972, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01526995, + "step": 2149, + "time_per_iteration": 2.809389352798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10438299, + "diversity_loss_mlp": 0.0, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06824786639125466, + "language_loss": 0.85333586, + "learning_rate": 0.0006612481898257264, + "loss": 0.8644954, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2150, + "time_per_iteration": 2.855074644088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137757, + "balance_loss_mlp": 1.12599659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.07789693292988349, + "language_loss": 0.851385, + "learning_rate": 0.000660953261722165, + "loss": 0.86276257, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.11749268, + "routerloss_mlp": 0.0, + "step": 2151, + "time_per_iteration": 2.5938022136688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113704, + "balance_loss_mlp": 1.12522054, + "diversity_loss_mlp": 0.0, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.08228338378299185, + "language_loss": 0.82884097, + "learning_rate": 0.0006606582711310055, + "loss": 0.84021133, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2152, + "time_per_iteration": 2.7282497882843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145368, + "balance_loss_mlp": 1.13366747, + "diversity_loss_mlp": 0.0, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.06559194318793425, + "language_loss": 0.82812124, + "learning_rate": 0.0006603632181667736, + "loss": 0.83957493, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2153, + "time_per_iteration": 2.6664750576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_mlp": 1.09754133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03767833543400207, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8004716, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 2154, + "time_per_iteration": 4.910309791564941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_mlp": 1.12367392, + "diversity_loss_mlp": 0.0, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.0807614788835298, + "language_loss": 0.81897664, + "learning_rate": 0.0006597729255773153, + "loss": 0.83032906, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2155, + "time_per_iteration": 2.509021520614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.13441765, + "diversity_loss_mlp": 0.0, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.07993173196210833, + "language_loss": 0.82465029, + "learning_rate": 0.0006594776861812608, + "loss": 0.83611095, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2156, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151315, + "balance_loss_mlp": 1.13991857, + "diversity_loss_mlp": 0.0, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06494614409867079, + "language_loss": 0.8654387, + "learning_rate": 0.0006591823848704776, + "loss": 0.87695187, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.11395264, + "routerloss_mlp": 0.0, + "step": 2157, + "time_per_iteration": 2.9039251804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134696, + "balance_loss_mlp": 1.12316287, + "diversity_loss_mlp": 0.0, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07584878913150254, + "language_loss": 0.81510401, + "learning_rate": 0.0006588870217596117, + "loss": 0.82645094, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2158, + "time_per_iteration": 2.7366249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121714, + "balance_loss_mlp": 1.11010289, + "diversity_loss_mlp": 0.0, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.0768974217493938, + "language_loss": 0.8567549, + "learning_rate": 0.0006585915969633334, + "loss": 0.86797202, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2159, + "time_per_iteration": 2.557969331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105923, + "balance_loss_mlp": 1.09437764, + "diversity_loss_mlp": 0.0, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06453825749462137, + "language_loss": 0.89545041, + "learning_rate": 0.0006582961105963366, + "loss": 0.90650964, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2160, + "time_per_iteration": 2.782766103744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089959, + "balance_loss_mlp": 1.07836008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.09389311079563152, + "language_loss": 0.77639234, + "learning_rate": 0.0006580005627733395, + "loss": 0.78729188, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2161, + "time_per_iteration": 2.7049734592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_mlp": 1.07492197, + "diversity_loss_mlp": 0.0, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.08236412019602501, + "language_loss": 0.81618345, + "learning_rate": 0.0006577049536090838, + "loss": 0.8270492, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2162, + "time_per_iteration": 2.723243236541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078674, + "balance_loss_mlp": 1.06676459, + "diversity_loss_mlp": 0.0, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.09869721655750711, + "language_loss": 0.85591501, + "learning_rate": 0.000657409283218335, + "loss": 0.86670172, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2163, + "time_per_iteration": 2.64973783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078005, + "balance_loss_mlp": 1.0662148, + "diversity_loss_mlp": 0.0, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.06806079796586995, + "language_loss": 0.81014043, + "learning_rate": 0.0006571135517158829, + "loss": 0.82092047, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2164, + "time_per_iteration": 2.6662614345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261192, + "balance_loss_mlp": 1.25542271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.0963910676883023, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.78025252, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2165, + "time_per_iteration": 4.733267068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07227921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.08489426271121504, + "language_loss": 0.83098751, + "learning_rate": 0.0006565219058351444, + "loss": 0.84183216, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 2166, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07506573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.0663020588108057, + "language_loss": 0.82663929, + "learning_rate": 0.0006562259916865553, + "loss": 0.83751583, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.12585449, + "routerloss_mlp": 0.0, + "step": 2167, + "time_per_iteration": 2.5647947788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085528, + "balance_loss_mlp": 1.07305884, + "diversity_loss_mlp": 0.0, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.11811458423881586, + "language_loss": 0.79392177, + "learning_rate": 0.0006559300168856573, + "loss": 0.80477709, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2168, + "time_per_iteration": 2.737071990966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090803, + "balance_loss_mlp": 1.07860184, + "diversity_loss_mlp": 0.0, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07183663020795078, + "language_loss": 0.86060214, + "learning_rate": 0.0006556339815473577, + "loss": 0.87151015, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2169, + "time_per_iteration": 2.6506707668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087149, + "balance_loss_mlp": 1.07504892, + "diversity_loss_mlp": 0.0, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.07609133400056706, + "language_loss": 0.86409211, + "learning_rate": 0.000655337885786588, + "loss": 0.87496364, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2170, + "time_per_iteration": 2.8835949897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.06654263, + "diversity_loss_mlp": 0.0, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08298304012821277, + "language_loss": 0.85129267, + "learning_rate": 0.0006550417297183025, + "loss": 0.86207461, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2171, + "time_per_iteration": 2.6195385456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087717, + "balance_loss_mlp": 1.07584357, + "diversity_loss_mlp": 0.0, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07223590906341684, + "language_loss": 0.81395489, + "learning_rate": 0.0006547455134574793, + "loss": 0.82483202, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2172, + "time_per_iteration": 2.688387155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091071, + "balance_loss_mlp": 1.07947183, + "diversity_loss_mlp": 0.0, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06986640066350178, + "language_loss": 0.84520721, + "learning_rate": 0.0006544492371191198, + "loss": 0.85611784, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2173, + "time_per_iteration": 3.1099753379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094341, + "balance_loss_mlp": 1.08226562, + "diversity_loss_mlp": 0.0, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.06657472623207703, + "language_loss": 0.8341983, + "learning_rate": 0.0006541529008182485, + "loss": 0.84514177, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2174, + "time_per_iteration": 3.203376054763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09567666, + "diversity_loss_mlp": 0.0, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.07167092475387357, + "language_loss": 0.87561977, + "learning_rate": 0.0006538565046699136, + "loss": 0.8866933, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2175, + "time_per_iteration": 2.6136248111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122872, + "balance_loss_mlp": 1.1111474, + "diversity_loss_mlp": 0.0, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.08073018870716439, + "language_loss": 0.81308544, + "learning_rate": 0.0006535600487891862, + "loss": 0.82431418, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2176, + "time_per_iteration": 2.8484995365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112142, + "balance_loss_mlp": 1.10968423, + "diversity_loss_mlp": 0.0, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.06933020813080157, + "language_loss": 0.89047962, + "learning_rate": 0.0006532635332911603, + "loss": 0.90169382, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2177, + "time_per_iteration": 2.6983814239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.12828767, + "diversity_loss_mlp": 0.0, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.07833316419755533, + "language_loss": 0.80340332, + "learning_rate": 0.0006529669582909541, + "loss": 0.81480134, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2178, + "time_per_iteration": 3.247034788131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130167, + "balance_loss_mlp": 1.11881781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08850961832331757, + "language_loss": 0.85867965, + "learning_rate": 0.0006526703239037077, + "loss": 0.86998129, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2179, + "time_per_iteration": 2.6653683185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933718, + "balance_loss_mlp": 1.62844765, + "diversity_loss_mlp": 0.20954823, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.029582524443817385, + "language_loss": 0.86593473, + "learning_rate": 0.0006523736302445851, + "loss": 0.87527192, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01471971, + "step": 2180, + "time_per_iteration": 2.857030153274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120353, + "balance_loss_mlp": 1.10893881, + "diversity_loss_mlp": 0.0, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.0687803817541909, + "language_loss": 0.77392578, + "learning_rate": 0.0006520768774287728, + "loss": 0.78512931, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2181, + "time_per_iteration": 5.625683307647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114289, + "balance_loss_mlp": 1.10282135, + "diversity_loss_mlp": 0.0, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06088029266780351, + "language_loss": 0.85493296, + "learning_rate": 0.0006517800655714806, + "loss": 0.86607587, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2182, + "time_per_iteration": 2.812955617904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105622, + "balance_loss_mlp": 1.09442866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07098705372074567, + "language_loss": 0.85399854, + "learning_rate": 0.0006514831947879407, + "loss": 0.86505473, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.11193848, + "routerloss_mlp": 0.0, + "step": 2183, + "time_per_iteration": 2.961418867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08642888, + "diversity_loss_mlp": 0.0, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.08450852264083888, + "language_loss": 0.78323019, + "learning_rate": 0.0006511862651934091, + "loss": 0.79420632, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2184, + "time_per_iteration": 3.076414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091172, + "balance_loss_mlp": 1.07956707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06921087236063693, + "language_loss": 0.82092035, + "learning_rate": 0.0006508892769031638, + "loss": 0.83183205, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2185, + "time_per_iteration": 2.638606309890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089875, + "balance_loss_mlp": 1.07868707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.07895440454445611, + "language_loss": 0.87322706, + "learning_rate": 0.000650592230032506, + "loss": 0.88412583, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2186, + "time_per_iteration": 2.702061176300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.0815382, + "diversity_loss_mlp": 0.0, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07748698496632533, + "language_loss": 0.85121393, + "learning_rate": 0.0006502951246967595, + "loss": 0.8621465, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2187, + "time_per_iteration": 2.871629476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087261, + "balance_loss_mlp": 1.07582331, + "diversity_loss_mlp": 0.0, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.06016607527200091, + "language_loss": 0.86913472, + "learning_rate": 0.0006499979610112706, + "loss": 0.88000733, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2188, + "time_per_iteration": 2.795278787612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107962, + "balance_loss_mlp": 1.06803894, + "diversity_loss_mlp": 0.0, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.0593739697007924, + "language_loss": 0.84024572, + "learning_rate": 0.000649700739091409, + "loss": 0.85104191, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2189, + "time_per_iteration": 2.822756290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123251, + "balance_loss_mlp": 1.11500144, + "diversity_loss_mlp": 0.0, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03860831682793276, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74959522, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 2190, + "time_per_iteration": 4.79919958114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082066, + "balance_loss_mlp": 1.07052088, + "diversity_loss_mlp": 0.0, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06761793691364075, + "language_loss": 0.85737348, + "learning_rate": 0.0006491061210101557, + "loss": 0.86819422, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2191, + "time_per_iteration": 2.661578416824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094285, + "balance_loss_mlp": 1.08270931, + "diversity_loss_mlp": 0.0, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0725556462678514, + "language_loss": 0.83956218, + "learning_rate": 0.0006488087250796157, + "loss": 0.85050505, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2192, + "time_per_iteration": 2.881225347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095445, + "balance_loss_mlp": 1.08376861, + "diversity_loss_mlp": 0.0, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.09298126342392905, + "language_loss": 0.81662476, + "learning_rate": 0.0006485112713764049, + "loss": 0.82757914, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2193, + "time_per_iteration": 2.8921914100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.08214593, + "diversity_loss_mlp": 0.0, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.058244545196029895, + "language_loss": 0.83715278, + "learning_rate": 0.0006482137600160051, + "loss": 0.84809017, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2194, + "time_per_iteration": 2.484341859817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094196, + "balance_loss_mlp": 1.08240056, + "diversity_loss_mlp": 0.0, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.08574033239321836, + "language_loss": 0.847399, + "learning_rate": 0.0006479161911139206, + "loss": 0.85834098, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2195, + "time_per_iteration": 2.5937106609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082105, + "balance_loss_mlp": 1.07043433, + "diversity_loss_mlp": 0.0, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08791937036502419, + "language_loss": 0.85522735, + "learning_rate": 0.0006476185647856778, + "loss": 0.86604846, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2196, + "time_per_iteration": 2.569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.06815672, + "diversity_loss_mlp": 0.0, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.07778870715402122, + "language_loss": 0.82192588, + "learning_rate": 0.0006473208811468255, + "loss": 0.83272707, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2197, + "time_per_iteration": 2.899557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.06046605, + "diversity_loss_mlp": 0.0, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.07330307904629892, + "language_loss": 0.84140831, + "learning_rate": 0.0006470231403129347, + "loss": 0.85212964, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.11663818, + "routerloss_mlp": 0.0, + "step": 2198, + "time_per_iteration": 2.602447509765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106933, + "balance_loss_mlp": 1.05760026, + "diversity_loss_mlp": 0.0, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06409293690085444, + "language_loss": 0.81590885, + "learning_rate": 0.0006467253423995988, + "loss": 0.82660222, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2199, + "time_per_iteration": 2.8557229042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_mlp": 1.06755078, + "diversity_loss_mlp": 0.0, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.07244216805562081, + "language_loss": 0.78831869, + "learning_rate": 0.000646427487522433, + "loss": 0.79911208, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2200, + "time_per_iteration": 2.65742826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_mlp": 1.07336855, + "diversity_loss_mlp": 0.0, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.07121994515744344, + "language_loss": 0.83032513, + "learning_rate": 0.0006461295757970749, + "loss": 0.84117424, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2201, + "time_per_iteration": 2.950655698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090426, + "balance_loss_mlp": 1.07880902, + "diversity_loss_mlp": 0.0, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.07713064950594434, + "language_loss": 0.81538546, + "learning_rate": 0.0006458316073391839, + "loss": 0.82628965, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2202, + "time_per_iteration": 2.8609914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089677, + "balance_loss_mlp": 1.07874584, + "diversity_loss_mlp": 0.0, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.07022827859020209, + "language_loss": 0.87709206, + "learning_rate": 0.0006455335822644422, + "loss": 0.88798881, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2203, + "time_per_iteration": 2.6978323459625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118526, + "balance_loss_mlp": 1.10743332, + "diversity_loss_mlp": 0.0, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.08724206882012846, + "language_loss": 0.78530163, + "learning_rate": 0.0006452355006885527, + "loss": 0.79648691, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2204, + "time_per_iteration": 2.686579704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00922718, + "balance_loss_mlp": 1.60671031, + "diversity_loss_mlp": 0.20807257, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.038668439213979985, + "language_loss": 0.8761735, + "learning_rate": 0.0006449373627272412, + "loss": 0.88540065, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532654, + "step": 2205, + "time_per_iteration": 2.7558722496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112883, + "balance_loss_mlp": 1.10164738, + "diversity_loss_mlp": 0.0, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08032286277613819, + "language_loss": 0.82142913, + "learning_rate": 0.0006446391684962553, + "loss": 0.83255792, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2206, + "time_per_iteration": 2.6579248905181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117757, + "balance_loss_mlp": 1.10650921, + "diversity_loss_mlp": 0.0, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.06707307211931093, + "language_loss": 0.82899106, + "learning_rate": 0.000644340918111364, + "loss": 0.8401686, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2207, + "time_per_iteration": 2.5347208976745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117145, + "balance_loss_mlp": 1.10573626, + "diversity_loss_mlp": 0.0, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.09153331321335235, + "language_loss": 0.84820396, + "learning_rate": 0.0006440426116883585, + "loss": 0.85937536, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2208, + "time_per_iteration": 2.5513036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.11258864, + "diversity_loss_mlp": 0.0, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.07442494649717855, + "language_loss": 0.86227304, + "learning_rate": 0.0006437442493430519, + "loss": 0.87351412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2209, + "time_per_iteration": 2.6560840606689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120019, + "balance_loss_mlp": 1.10829473, + "diversity_loss_mlp": 0.0, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.09545289030190586, + "language_loss": 0.86441422, + "learning_rate": 0.000643445831191278, + "loss": 0.8756144, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2210, + "time_per_iteration": 2.9028308391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.09162724, + "diversity_loss_mlp": 0.0, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.07646392549286844, + "language_loss": 0.81526744, + "learning_rate": 0.0006431473573488937, + "loss": 0.82629919, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2211, + "time_per_iteration": 2.7377443313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089807, + "balance_loss_mlp": 1.0782795, + "diversity_loss_mlp": 0.0, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.08107145257136338, + "language_loss": 0.85147351, + "learning_rate": 0.0006428488279317765, + "loss": 0.86237156, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2212, + "time_per_iteration": 2.6276626586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.08065951, + "diversity_loss_mlp": 0.0, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.09124161172132733, + "language_loss": 0.87490094, + "learning_rate": 0.0006425502430558259, + "loss": 0.88581866, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.11120605, + "routerloss_mlp": 0.0, + "step": 2213, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.08046961, + "diversity_loss_mlp": 0.0, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.06865062693642494, + "language_loss": 0.84588826, + "learning_rate": 0.0006422516028369628, + "loss": 0.85680431, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.11138916, + "routerloss_mlp": 0.0, + "step": 2214, + "time_per_iteration": 2.639619827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085781, + "balance_loss_mlp": 1.07456374, + "diversity_loss_mlp": 0.0, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.06481575152476399, + "language_loss": 0.83497036, + "learning_rate": 0.0006419529073911296, + "loss": 0.84582818, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2215, + "time_per_iteration": 2.8564555644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091551, + "balance_loss_mlp": 1.08075058, + "diversity_loss_mlp": 0.0, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.07537518077633425, + "language_loss": 0.85102242, + "learning_rate": 0.0006416541568342901, + "loss": 0.86193788, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2216, + "time_per_iteration": 2.8998327255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082292, + "balance_loss_mlp": 1.07092535, + "diversity_loss_mlp": 0.0, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.06331803259599181, + "language_loss": 0.84347832, + "learning_rate": 0.0006413553512824297, + "loss": 0.85430121, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2217, + "time_per_iteration": 2.754044532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_mlp": 1.07307625, + "diversity_loss_mlp": 0.0, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07616444203019798, + "language_loss": 0.84374213, + "learning_rate": 0.0006410564908515549, + "loss": 0.85458404, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2218, + "time_per_iteration": 2.724478006362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06966138, + "diversity_loss_mlp": 0.0, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.0731173396075932, + "language_loss": 0.85161233, + "learning_rate": 0.0006407575756576935, + "loss": 0.86242241, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2219, + "time_per_iteration": 2.754624128341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093699, + "balance_loss_mlp": 1.08191478, + "diversity_loss_mlp": 0.0, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.068521011535794, + "language_loss": 0.87612599, + "learning_rate": 0.0006404586058168951, + "loss": 0.88706297, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2220, + "time_per_iteration": 2.6972298622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100965, + "balance_loss_mlp": 1.08927631, + "diversity_loss_mlp": 0.0, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.1033551804820373, + "language_loss": 0.86327708, + "learning_rate": 0.0006401595814452296, + "loss": 0.87428677, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2221, + "time_per_iteration": 2.6071925163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.08816695, + "diversity_loss_mlp": 0.0, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07649462730323824, + "language_loss": 0.8070569, + "learning_rate": 0.000639860502658789, + "loss": 0.81805706, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2222, + "time_per_iteration": 2.6844141483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101843, + "balance_loss_mlp": 1.08965993, + "diversity_loss_mlp": 0.0, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0652732350229211, + "language_loss": 0.84929889, + "learning_rate": 0.0006395613695736853, + "loss": 0.86031729, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2223, + "time_per_iteration": 2.6799042224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091013, + "balance_loss_mlp": 1.07850194, + "diversity_loss_mlp": 0.0, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.10552751254703834, + "language_loss": 0.82026577, + "learning_rate": 0.0006392621823060529, + "loss": 0.83117592, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 2224, + "time_per_iteration": 2.722675323486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083211, + "balance_loss_mlp": 1.07109332, + "diversity_loss_mlp": 0.0, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.0790777786133485, + "language_loss": 0.8508532, + "learning_rate": 0.0006389629409720465, + "loss": 0.86168534, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2225, + "time_per_iteration": 2.6559393405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084084, + "balance_loss_mlp": 1.07179379, + "diversity_loss_mlp": 0.0, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0811747132385773, + "language_loss": 0.88654399, + "learning_rate": 0.0006386636456878417, + "loss": 0.89738482, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2226, + "time_per_iteration": 2.898261308670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_mlp": 1.07153535, + "diversity_loss_mlp": 0.0, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07696212536929578, + "language_loss": 0.92413348, + "learning_rate": 0.0006383642965696353, + "loss": 0.93497235, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2227, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932178, + "balance_loss_mlp": 1.62005818, + "diversity_loss_mlp": 0.21207821, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.033827312051000154, + "language_loss": 0.83018744, + "learning_rate": 0.000638064893733645, + "loss": 0.83950925, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01611001, + "step": 2228, + "time_per_iteration": 2.74554705619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00939878, + "balance_loss_mlp": 1.63503206, + "diversity_loss_mlp": 0.21170495, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.03357304306136308, + "language_loss": 0.90087909, + "learning_rate": 0.000637765437296109, + "loss": 0.91027784, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01650969, + "step": 2229, + "time_per_iteration": 2.6807308197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086799, + "balance_loss_mlp": 1.07446718, + "diversity_loss_mlp": 0.0, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.09425394332621637, + "language_loss": 0.85585725, + "learning_rate": 0.000637465927373287, + "loss": 0.86672527, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2230, + "time_per_iteration": 2.6279454231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088133, + "balance_loss_mlp": 1.0761342, + "diversity_loss_mlp": 0.0, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.13300209785278838, + "language_loss": 0.79446864, + "learning_rate": 0.000637166364081459, + "loss": 0.80534995, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.11993408, + "routerloss_mlp": 0.0, + "step": 2231, + "time_per_iteration": 2.7252066135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108179, + "balance_loss_mlp": 1.07001245, + "diversity_loss_mlp": 0.0, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.08046243261781533, + "language_loss": 0.84081841, + "learning_rate": 0.0006368667475369256, + "loss": 0.85163629, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2232, + "time_per_iteration": 2.756286382675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03840148, + "diversity_loss_mlp": 0.0, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.02809293853716727, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79574001, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.078125, + "routerloss_mlp": 0.0, + "step": 2233, + "time_per_iteration": 4.852276086807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_mlp": 1.02313304, + "diversity_loss_mlp": 0.0, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.02329901381823612, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79926044, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2234, + "time_per_iteration": 4.812516689300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107186, + "balance_loss_mlp": 1.09534228, + "diversity_loss_mlp": 0.0, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.06628794940731256, + "language_loss": 0.86166692, + "learning_rate": 0.0006359675795504112, + "loss": 0.87273884, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2235, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112399, + "balance_loss_mlp": 1.11230159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08124483128316094, + "language_loss": 0.74637383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75761378, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2236, + "time_per_iteration": 3.51676082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138804, + "balance_loss_mlp": 1.12733603, + "diversity_loss_mlp": 0.0, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.08045247853644188, + "language_loss": 0.85975677, + "learning_rate": 0.0006353678700956511, + "loss": 0.87114477, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2237, + "time_per_iteration": 2.5487072467803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137661, + "balance_loss_mlp": 1.12605572, + "diversity_loss_mlp": 0.0, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.08414636037035166, + "language_loss": 0.84184766, + "learning_rate": 0.0006350679364783569, + "loss": 0.85322422, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2238, + "time_per_iteration": 2.730128288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113973, + "balance_loss_mlp": 1.1279577, + "diversity_loss_mlp": 0.0, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.06707032645836293, + "language_loss": 0.85872072, + "learning_rate": 0.0006347679504230393, + "loss": 0.87011802, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2239, + "time_per_iteration": 2.640791893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136631, + "balance_loss_mlp": 1.12453079, + "diversity_loss_mlp": 0.0, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07174503893432663, + "language_loss": 0.7626543, + "learning_rate": 0.0006344679120461632, + "loss": 0.77402061, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2240, + "time_per_iteration": 3.3352768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128316, + "balance_loss_mlp": 1.11687779, + "diversity_loss_mlp": 0.0, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.08647233478950261, + "language_loss": 0.79984182, + "learning_rate": 0.0006341678214642134, + "loss": 0.81112498, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2241, + "time_per_iteration": 2.662132740020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114748, + "balance_loss_mlp": 1.10336995, + "diversity_loss_mlp": 0.0, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06482352137494116, + "language_loss": 0.82986903, + "learning_rate": 0.0006338676787936963, + "loss": 0.84101653, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2242, + "time_per_iteration": 3.064518451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123318, + "balance_loss_mlp": 1.11183178, + "diversity_loss_mlp": 0.0, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.07554467546841755, + "language_loss": 0.84015846, + "learning_rate": 0.0006335674841511367, + "loss": 0.85139167, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2243, + "time_per_iteration": 2.7494354248046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067953, + "balance_loss_mlp": 1.06189752, + "diversity_loss_mlp": 0.0, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.020266409588932003, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80249119, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2244, + "time_per_iteration": 5.019898414611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_mlp": 1.05208015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.017496917907237546, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78423691, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2245, + "time_per_iteration": 4.940483808517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111091, + "balance_loss_mlp": 1.09893775, + "diversity_loss_mlp": 0.0, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.07826437205196314, + "language_loss": 0.82487583, + "learning_rate": 0.0006326665895567652, + "loss": 0.83598673, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2246, + "time_per_iteration": 2.6287152767181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_mlp": 1.09895015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.09268036537549412, + "language_loss": 0.87613881, + "learning_rate": 0.0006323661881916976, + "loss": 0.88725001, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2247, + "time_per_iteration": 2.6966464519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_mlp": 1.08901072, + "diversity_loss_mlp": 0.0, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.07850654458656253, + "language_loss": 0.812437, + "learning_rate": 0.0006320657354375179, + "loss": 0.82344878, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2248, + "time_per_iteration": 3.0057384967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.08872366, + "diversity_loss_mlp": 0.0, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.07399569527983862, + "language_loss": 0.87203169, + "learning_rate": 0.0006317652314108726, + "loss": 0.88303995, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2249, + "time_per_iteration": 2.6106557846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083093, + "balance_loss_mlp": 1.07126176, + "diversity_loss_mlp": 0.0, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07131076511794647, + "language_loss": 0.91191232, + "learning_rate": 0.0006314646762284277, + "loss": 0.92274326, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2250, + "time_per_iteration": 2.601017951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_mlp": 1.02617049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.02997957544407836, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76458681, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2251, + "time_per_iteration": 4.872025966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_mlp": 1.07351613, + "diversity_loss_mlp": 0.0, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07162967916255573, + "language_loss": 0.77412337, + "learning_rate": 0.0006308634128629022, + "loss": 0.78497767, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2252, + "time_per_iteration": 2.858896255493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089815, + "balance_loss_mlp": 1.07750654, + "diversity_loss_mlp": 0.0, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0655401202696214, + "language_loss": 0.8742274, + "learning_rate": 0.0006305627049132531, + "loss": 0.88512552, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2253, + "time_per_iteration": 2.8089702129364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_mlp": 1.07309866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05577202062379855, + "language_loss": 0.85968709, + "learning_rate": 0.0006302619462746662, + "loss": 0.87054229, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2254, + "time_per_iteration": 3.117469072341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090126, + "balance_loss_mlp": 1.07842588, + "diversity_loss_mlp": 0.0, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.07095559842956704, + "language_loss": 0.90230805, + "learning_rate": 0.0006299611370639069, + "loss": 0.91320932, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2255, + "time_per_iteration": 2.723188638687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084284, + "balance_loss_mlp": 1.07239318, + "diversity_loss_mlp": 0.0, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07367301477096526, + "language_loss": 0.79524988, + "learning_rate": 0.0006296602773977593, + "loss": 0.80609274, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2256, + "time_per_iteration": 2.6743130683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099708, + "balance_loss_mlp": 1.08790588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06301035546935001, + "language_loss": 0.87406039, + "learning_rate": 0.0006293593673930277, + "loss": 0.88505745, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2257, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103769, + "balance_loss_mlp": 1.09211683, + "diversity_loss_mlp": 0.0, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07716264473653381, + "language_loss": 0.78774142, + "learning_rate": 0.0006290584071665358, + "loss": 0.79877913, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2258, + "time_per_iteration": 2.9148640632629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.07634544, + "diversity_loss_mlp": 0.0, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06859255861010008, + "language_loss": 0.82309216, + "learning_rate": 0.0006287573968351266, + "loss": 0.83397484, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2259, + "time_per_iteration": 2.582099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081836, + "balance_loss_mlp": 1.06989694, + "diversity_loss_mlp": 0.0, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.0728512329620832, + "language_loss": 0.8210361, + "learning_rate": 0.0006284563365156626, + "loss": 0.83185446, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2260, + "time_per_iteration": 2.802004814147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_mlp": 1.06343079, + "diversity_loss_mlp": 0.0, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.08318375282180102, + "language_loss": 0.87862843, + "learning_rate": 0.0006281552263250261, + "loss": 0.88938093, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2261, + "time_per_iteration": 2.5335495471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_mlp": 1.02721453, + "diversity_loss_mlp": 0.0, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02511862566194507, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81726044, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 2262, + "time_per_iteration": 4.858395338058472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05593562, + "diversity_loss_mlp": 0.0, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.07030760098393707, + "language_loss": 0.81181604, + "learning_rate": 0.0006275528567978593, + "loss": 0.82249182, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2263, + "time_per_iteration": 2.9562113285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.05570674, + "diversity_loss_mlp": 0.0, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.09515047383985015, + "language_loss": 0.82464182, + "learning_rate": 0.0006272515976951898, + "loss": 0.83531702, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2264, + "time_per_iteration": 3.0750486850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106778, + "balance_loss_mlp": 1.05625236, + "diversity_loss_mlp": 0.0, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06538835415995116, + "language_loss": 0.7903443, + "learning_rate": 0.0006269502891890687, + "loss": 0.80102211, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2265, + "time_per_iteration": 3.0723042488098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069278, + "balance_loss_mlp": 1.05721438, + "diversity_loss_mlp": 0.0, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.06791130510000161, + "language_loss": 0.88071477, + "learning_rate": 0.0006266489313964743, + "loss": 0.89140749, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2266, + "time_per_iteration": 2.7362618446350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00937641, + "balance_loss_mlp": 1.63294578, + "diversity_loss_mlp": 0.21328503, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.028233172977391998, + "language_loss": 0.85207379, + "learning_rate": 0.0006263475244344041, + "loss": 0.8614502, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01452552, + "step": 2267, + "time_per_iteration": 2.8842954635620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082065, + "balance_loss_mlp": 1.06979251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.07502115173737808, + "language_loss": 0.84271002, + "learning_rate": 0.0006260460684198746, + "loss": 0.8535307, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2268, + "time_per_iteration": 2.6355533599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.07749879, + "diversity_loss_mlp": 0.0, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07640014386484298, + "language_loss": 0.84040511, + "learning_rate": 0.0006257445634699213, + "loss": 0.85130346, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.12322998, + "routerloss_mlp": 0.0, + "step": 2269, + "time_per_iteration": 2.5279150009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089112, + "balance_loss_mlp": 1.07683921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.16142331523875347, + "language_loss": 0.83037758, + "learning_rate": 0.0006254430097015993, + "loss": 0.84126872, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2270, + "time_per_iteration": 2.660228729248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_mlp": 1.03087568, + "diversity_loss_mlp": 0.0, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.024589935077845904, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77516735, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2271, + "time_per_iteration": 4.794579744338989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070138, + "balance_loss_mlp": 1.05796623, + "diversity_loss_mlp": 0.0, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.057648382072647573, + "language_loss": 0.85053569, + "learning_rate": 0.0006248397561781609, + "loss": 0.86123705, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2272, + "time_per_iteration": 2.862569570541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067836, + "balance_loss_mlp": 1.05557537, + "diversity_loss_mlp": 0.0, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08840424380788836, + "language_loss": 0.86255217, + "learning_rate": 0.0006245380566572482, + "loss": 0.87323052, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2273, + "time_per_iteration": 2.7386484146118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.0566572, + "diversity_loss_mlp": 0.0, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07723857249852564, + "language_loss": 0.75794655, + "learning_rate": 0.0006242363087863744, + "loss": 0.76863599, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2274, + "time_per_iteration": 2.948030710220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.05560887, + "diversity_loss_mlp": 0.0, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06687985923679116, + "language_loss": 0.86043644, + "learning_rate": 0.0006239345126826878, + "loss": 0.87111151, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2275, + "time_per_iteration": 2.787750482559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071289, + "balance_loss_mlp": 1.05926108, + "diversity_loss_mlp": 0.0, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07503499995760528, + "language_loss": 0.83946115, + "learning_rate": 0.0006236326684633561, + "loss": 0.85017407, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2276, + "time_per_iteration": 2.8109841346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.05921769, + "diversity_loss_mlp": 0.0, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.08049471875944368, + "language_loss": 0.75253642, + "learning_rate": 0.0006233307762455658, + "loss": 0.76324785, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2277, + "time_per_iteration": 2.632291793823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_mlp": 1.06043518, + "diversity_loss_mlp": 0.0, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.0727539933311737, + "language_loss": 0.83312476, + "learning_rate": 0.0006230288361465216, + "loss": 0.8438465, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2278, + "time_per_iteration": 3.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106943, + "balance_loss_mlp": 1.05752659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.08745359184854619, + "language_loss": 0.84888816, + "learning_rate": 0.0006227268482834473, + "loss": 0.85958248, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2279, + "time_per_iteration": 2.9116861820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929134, + "balance_loss_mlp": 1.61467147, + "diversity_loss_mlp": 0.21327347, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.03053717197724305, + "language_loss": 0.8733198, + "learning_rate": 0.000622424812773585, + "loss": 0.88261116, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0151619, + "step": 2280, + "time_per_iteration": 2.83655047416687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087033, + "balance_loss_mlp": 1.07515955, + "diversity_loss_mlp": 0.0, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.09030781332224262, + "language_loss": 0.8003484, + "learning_rate": 0.000622122729734195, + "loss": 0.81121874, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2281, + "time_per_iteration": 2.598515033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088881, + "balance_loss_mlp": 1.07746708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.05965815533468205, + "language_loss": 0.87430406, + "learning_rate": 0.0006218205992825566, + "loss": 0.88519287, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2282, + "time_per_iteration": 2.6424663066864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_mlp": 1.07271123, + "diversity_loss_mlp": 0.0, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.06483845116972914, + "language_loss": 0.81733787, + "learning_rate": 0.0006215184215359671, + "loss": 0.8281818, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2283, + "time_per_iteration": 2.736311674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087917, + "balance_loss_mlp": 1.07662153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.0656289826640407, + "language_loss": 0.86697561, + "learning_rate": 0.0006212161966117425, + "loss": 0.8778547, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2284, + "time_per_iteration": 2.727402448654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091514, + "balance_loss_mlp": 1.07989156, + "diversity_loss_mlp": 0.0, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.07463232969806483, + "language_loss": 0.81628394, + "learning_rate": 0.0006209139246272164, + "loss": 0.8271991, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2285, + "time_per_iteration": 2.978759527206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.08205843, + "diversity_loss_mlp": 0.0, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.08236326374350296, + "language_loss": 0.81938732, + "learning_rate": 0.0006206116056997421, + "loss": 0.83032608, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2286, + "time_per_iteration": 2.6111207008361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085811, + "balance_loss_mlp": 1.07444477, + "diversity_loss_mlp": 0.0, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.06662472973472185, + "language_loss": 0.82727671, + "learning_rate": 0.0006203092399466892, + "loss": 0.83813483, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2287, + "time_per_iteration": 2.6246864795684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109279, + "balance_loss_mlp": 1.08137023, + "diversity_loss_mlp": 0.0, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.06470350083987941, + "language_loss": 0.85380936, + "learning_rate": 0.0006200068274854473, + "loss": 0.86473733, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.11419678, + "routerloss_mlp": 0.0, + "step": 2288, + "time_per_iteration": 2.675197124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091416, + "balance_loss_mlp": 1.07988858, + "diversity_loss_mlp": 0.0, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0650031810595099, + "language_loss": 0.8588661, + "learning_rate": 0.0006197043684334229, + "loss": 0.86978024, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2289, + "time_per_iteration": 2.787095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_mlp": 1.08063841, + "diversity_loss_mlp": 0.0, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.0715970788084748, + "language_loss": 0.79333103, + "learning_rate": 0.0006194018629080411, + "loss": 0.80425215, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2290, + "time_per_iteration": 2.817836284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.09150028, + "diversity_loss_mlp": 0.0, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.07061114258803743, + "language_loss": 0.81714827, + "learning_rate": 0.0006190993110267451, + "loss": 0.82817852, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2291, + "time_per_iteration": 2.741288900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108227, + "balance_loss_mlp": 1.09614503, + "diversity_loss_mlp": 0.0, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.07455801894128893, + "language_loss": 0.84193838, + "learning_rate": 0.0006187967129069958, + "loss": 0.85302061, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2292, + "time_per_iteration": 2.5778286457061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106682, + "balance_loss_mlp": 1.09472573, + "diversity_loss_mlp": 0.0, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.06400814904414545, + "language_loss": 0.8690064, + "learning_rate": 0.0006184940686662722, + "loss": 0.88007319, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2293, + "time_per_iteration": 2.7292487621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111022, + "balance_loss_mlp": 1.09812045, + "diversity_loss_mlp": 0.0, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06813451942076464, + "language_loss": 0.90379488, + "learning_rate": 0.0006181913784220714, + "loss": 0.91489702, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2294, + "time_per_iteration": 2.6506428718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081962, + "balance_loss_mlp": 1.0750953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.029819366941177792, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81635749, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2295, + "time_per_iteration": 4.882002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110182, + "balance_loss_mlp": 1.09772444, + "diversity_loss_mlp": 0.0, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07012194180041048, + "language_loss": 0.7971437, + "learning_rate": 0.0006175858603933146, + "loss": 0.80824548, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.12469482, + "routerloss_mlp": 0.0, + "step": 2296, + "time_per_iteration": 2.8836371898651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908854, + "balance_loss_mlp": 1.58032632, + "diversity_loss_mlp": 0.2095283, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.03267646081870075, + "language_loss": 0.80986243, + "learning_rate": 0.0006172830328438416, + "loss": 0.81895095, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01392685, + "step": 2297, + "time_per_iteration": 2.9758472442626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093338, + "balance_loss_mlp": 1.0806725, + "diversity_loss_mlp": 0.0, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.0684627092891604, + "language_loss": 0.86739677, + "learning_rate": 0.0006169801597610572, + "loss": 0.87833017, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2298, + "time_per_iteration": 2.796999454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080861, + "balance_loss_mlp": 1.06855834, + "diversity_loss_mlp": 0.0, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09148837874044675, + "language_loss": 0.89672303, + "learning_rate": 0.0006166772412625469, + "loss": 0.90753162, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2299, + "time_per_iteration": 2.719217300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079493, + "balance_loss_mlp": 1.06674969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.0806717243265584, + "language_loss": 0.81995088, + "learning_rate": 0.0006163742774659141, + "loss": 0.83074582, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2300, + "time_per_iteration": 2.857851266860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082582, + "balance_loss_mlp": 1.07051837, + "diversity_loss_mlp": 0.0, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07368324051857801, + "language_loss": 0.85920924, + "learning_rate": 0.0006160712684887801, + "loss": 0.87003505, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2301, + "time_per_iteration": 2.7615816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076648, + "balance_loss_mlp": 1.06491232, + "diversity_loss_mlp": 0.0, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.07775198871362894, + "language_loss": 0.81987381, + "learning_rate": 0.0006157682144487832, + "loss": 0.83064032, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2302, + "time_per_iteration": 2.759446620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071769, + "balance_loss_mlp": 1.05998516, + "diversity_loss_mlp": 0.0, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.07391427816126875, + "language_loss": 0.82887244, + "learning_rate": 0.0006154651154635793, + "loss": 0.83959019, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2303, + "time_per_iteration": 2.8566582202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074582, + "balance_loss_mlp": 1.0627867, + "diversity_loss_mlp": 0.0, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07276664214775759, + "language_loss": 0.84800553, + "learning_rate": 0.0006151619716508421, + "loss": 0.85875136, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2304, + "time_per_iteration": 2.678624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070842, + "balance_loss_mlp": 1.05890322, + "diversity_loss_mlp": 0.0, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0708190445963316, + "language_loss": 0.87117589, + "learning_rate": 0.0006148587831282625, + "loss": 0.88188434, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2305, + "time_per_iteration": 2.6833643913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05813479, + "diversity_loss_mlp": 0.0, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03167846404368131, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80241072, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2306, + "time_per_iteration": 4.908214092254639 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.06202734, + "diversity_loss_mlp": 0.0, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.10781991147306623, + "language_loss": 0.87386847, + "learning_rate": 0.0006142522724244255, + "loss": 0.8846153, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.12664795, + "routerloss_mlp": 0.0, + "step": 2307, + "time_per_iteration": 2.559011459350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_mlp": 1.03301477, + "diversity_loss_mlp": 0.0, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.019467834986953515, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77524698, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.06982422, + "routerloss_mlp": 0.0, + "step": 2308, + "time_per_iteration": 4.990226984024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010956, + "balance_loss_mlp": 1.08379281, + "diversity_loss_mlp": 0.0, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.134173965781989, + "language_loss": 0.77330542, + "learning_rate": 0.000613645584293942, + "loss": 0.78426147, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2309, + "time_per_iteration": 2.925625801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096392, + "balance_loss_mlp": 1.08444726, + "diversity_loss_mlp": 0.0, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.07260585347328512, + "language_loss": 0.83497787, + "learning_rate": 0.0006133421739881185, + "loss": 0.84594172, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2310, + "time_per_iteration": 2.6521387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105727, + "balance_loss_mlp": 1.09360933, + "diversity_loss_mlp": 0.0, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08716252058009813, + "language_loss": 0.82747865, + "learning_rate": 0.0006130387196789605, + "loss": 0.8385359, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2311, + "time_per_iteration": 2.7266759872436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100575, + "balance_loss_mlp": 1.08809423, + "diversity_loss_mlp": 0.0, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.057672451626414926, + "language_loss": 0.84308195, + "learning_rate": 0.0006127352214842795, + "loss": 0.85408771, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2312, + "time_per_iteration": 2.9728119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_mlp": 1.09263897, + "diversity_loss_mlp": 0.0, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.09124128780751645, + "language_loss": 0.85551131, + "learning_rate": 0.0006124316795219041, + "loss": 0.86655927, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2313, + "time_per_iteration": 2.793999671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098642, + "balance_loss_mlp": 1.08649504, + "diversity_loss_mlp": 0.0, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.07392199689713573, + "language_loss": 0.82170153, + "learning_rate": 0.0006121280939096794, + "loss": 0.83268797, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 2314, + "time_per_iteration": 2.7882213592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087686, + "balance_loss_mlp": 1.07496047, + "diversity_loss_mlp": 0.0, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.07188819518398708, + "language_loss": 0.87831259, + "learning_rate": 0.000611824464765468, + "loss": 0.88918942, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.12738037, + "routerloss_mlp": 0.0, + "step": 2315, + "time_per_iteration": 2.570239305496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_mlp": 1.03435254, + "diversity_loss_mlp": 0.0, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.031544046963938845, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79636735, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 2316, + "time_per_iteration": 4.63933539390564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107211, + "balance_loss_mlp": 1.05995071, + "diversity_loss_mlp": 0.0, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.10006595419905694, + "language_loss": 0.85561663, + "learning_rate": 0.000611217076352619, + "loss": 0.86633772, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2317, + "time_per_iteration": 2.763282299041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068374, + "balance_loss_mlp": 1.05613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.07080250397958886, + "language_loss": 0.8323034, + "learning_rate": 0.0006109133173197905, + "loss": 0.84298718, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2318, + "time_per_iteration": 2.7228074073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.0546751, + "diversity_loss_mlp": 0.0, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07919775459104113, + "language_loss": 0.85392821, + "learning_rate": 0.0006106095152265935, + "loss": 0.86459887, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.12390137, + "routerloss_mlp": 0.0, + "step": 2319, + "time_per_iteration": 2.950333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.05547166, + "diversity_loss_mlp": 0.0, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.061336847968553085, + "language_loss": 0.84789562, + "learning_rate": 0.0006103056701909739, + "loss": 0.85857224, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2320, + "time_per_iteration": 2.9283788204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076472, + "balance_loss_mlp": 1.06437278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.06696737396207848, + "language_loss": 0.83276129, + "learning_rate": 0.0006100017823308956, + "loss": 0.84352595, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2321, + "time_per_iteration": 3.159337282180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072799, + "balance_loss_mlp": 1.06091988, + "diversity_loss_mlp": 0.0, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.07676377008356373, + "language_loss": 0.79803503, + "learning_rate": 0.0006096978517643377, + "loss": 0.80876303, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2322, + "time_per_iteration": 2.8253674507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00921995, + "balance_loss_mlp": 1.60181236, + "diversity_loss_mlp": 0.21422489, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.03237790796068106, + "language_loss": 0.83347481, + "learning_rate": 0.0006093938786092968, + "loss": 0.84269476, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01397606, + "step": 2323, + "time_per_iteration": 2.648444890975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110106, + "balance_loss_mlp": 1.09840608, + "diversity_loss_mlp": 0.0, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.07300553293113453, + "language_loss": 0.90023661, + "learning_rate": 0.0006090898629837857, + "loss": 0.91133773, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2324, + "time_per_iteration": 2.852698564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126468, + "balance_loss_mlp": 1.11461282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.06000654076761871, + "language_loss": 0.87143672, + "learning_rate": 0.0006087858050058337, + "loss": 0.8827014, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2325, + "time_per_iteration": 2.7674834728240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138358, + "balance_loss_mlp": 1.12663388, + "diversity_loss_mlp": 0.0, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.0853990663964482, + "language_loss": 0.82412744, + "learning_rate": 0.0006084817047934866, + "loss": 0.83551097, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2326, + "time_per_iteration": 2.6421871185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121575, + "balance_loss_mlp": 1.10977352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08985792381424736, + "language_loss": 0.89330196, + "learning_rate": 0.0006081775624648066, + "loss": 0.90451771, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2327, + "time_per_iteration": 2.578197956085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131674, + "balance_loss_mlp": 1.12057006, + "diversity_loss_mlp": 0.0, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.0872530433154025, + "language_loss": 0.83162999, + "learning_rate": 0.0006078733781378721, + "loss": 0.84294665, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2328, + "time_per_iteration": 2.6186208724975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099348, + "balance_loss_mlp": 1.08810675, + "diversity_loss_mlp": 0.0, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07633837573658239, + "language_loss": 0.82202363, + "learning_rate": 0.0006075691519307781, + "loss": 0.83301711, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2329, + "time_per_iteration": 2.9000244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094166, + "balance_loss_mlp": 1.08247721, + "diversity_loss_mlp": 0.0, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0736281868256213, + "language_loss": 0.81618124, + "learning_rate": 0.0006072648839616356, + "loss": 0.82712287, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2330, + "time_per_iteration": 2.6364829540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083826, + "balance_loss_mlp": 1.07230425, + "diversity_loss_mlp": 0.0, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.0657010816534965, + "language_loss": 0.82723016, + "learning_rate": 0.0006069605743485718, + "loss": 0.83806837, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2331, + "time_per_iteration": 3.3334474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086805, + "balance_loss_mlp": 1.07531917, + "diversity_loss_mlp": 0.0, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07225675858451452, + "language_loss": 0.83265316, + "learning_rate": 0.0006066562232097303, + "loss": 0.84352124, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2332, + "time_per_iteration": 2.705143690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.07051468, + "diversity_loss_mlp": 0.0, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.06521315479324259, + "language_loss": 0.8614397, + "learning_rate": 0.0006063518306632708, + "loss": 0.87226027, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2333, + "time_per_iteration": 2.9501705169677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085822, + "balance_loss_mlp": 1.07427073, + "diversity_loss_mlp": 0.0, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.07251688845149425, + "language_loss": 0.82197714, + "learning_rate": 0.0006060473968273688, + "loss": 0.83283544, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2334, + "time_per_iteration": 2.708394765853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_mlp": 1.032179, + "diversity_loss_mlp": 0.0, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.02865006957504222, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78918916, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.07177734, + "routerloss_mlp": 0.0, + "step": 2335, + "time_per_iteration": 4.866912841796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_mlp": 1.01901519, + "diversity_loss_mlp": 0.0, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.021847156852776353, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82031286, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 2336, + "time_per_iteration": 4.834076642990112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108818, + "balance_loss_mlp": 1.07613969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.09890748330953583, + "language_loss": 0.88285863, + "learning_rate": 0.0006051338487650047, + "loss": 0.89374042, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2337, + "time_per_iteration": 2.4428114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00930205, + "balance_loss_mlp": 1.62015963, + "diversity_loss_mlp": 0.20974493, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.03186253719782368, + "language_loss": 0.82399797, + "learning_rate": 0.0006048292509534095, + "loss": 0.83329999, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01525321, + "step": 2338, + "time_per_iteration": 2.6332457065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.06772542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.08456945041025239, + "language_loss": 0.77873439, + "learning_rate": 0.0006045246124434895, + "loss": 0.7895329, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2339, + "time_per_iteration": 2.7590980529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073061, + "balance_loss_mlp": 1.06156278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06841757056071682, + "language_loss": 0.86623305, + "learning_rate": 0.0006042199333535162, + "loss": 0.87696362, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2340, + "time_per_iteration": 3.293574333190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079106, + "balance_loss_mlp": 1.06769133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06101547553515947, + "language_loss": 0.84343052, + "learning_rate": 0.0006039152138017763, + "loss": 0.85422158, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2341, + "time_per_iteration": 3.0700981616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.07579744, + "diversity_loss_mlp": 0.0, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.09071323966594208, + "language_loss": 0.83541143, + "learning_rate": 0.0006036104539065726, + "loss": 0.84628195, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.11260986, + "routerloss_mlp": 0.0, + "step": 2342, + "time_per_iteration": 2.6694719791412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089407, + "balance_loss_mlp": 1.07793319, + "diversity_loss_mlp": 0.0, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.08270437502254605, + "language_loss": 0.84371507, + "learning_rate": 0.000603305653786223, + "loss": 0.85460913, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2343, + "time_per_iteration": 3.16105318069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_mlp": 1.07187295, + "diversity_loss_mlp": 0.0, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.07028076371432387, + "language_loss": 0.84103405, + "learning_rate": 0.0006030008135590622, + "loss": 0.85186827, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2344, + "time_per_iteration": 2.7197835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082164, + "balance_loss_mlp": 1.07096398, + "diversity_loss_mlp": 0.0, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.05864949769745669, + "language_loss": 0.7999413, + "learning_rate": 0.0006026959333434387, + "loss": 0.81076288, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2345, + "time_per_iteration": 2.777010202407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919083, + "balance_loss_mlp": 1.6008426, + "diversity_loss_mlp": 0.20793086, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.028469676504860836, + "language_loss": 0.77684712, + "learning_rate": 0.0006023910132577181, + "loss": 0.78603798, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01469593, + "step": 2346, + "time_per_iteration": 2.689173936843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.08186746, + "diversity_loss_mlp": 0.0, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.07173117007756048, + "language_loss": 0.84956741, + "learning_rate": 0.0006020860534202806, + "loss": 0.86050057, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2347, + "time_per_iteration": 2.499941110610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099048, + "balance_loss_mlp": 1.08747303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06525031943024168, + "language_loss": 0.81076705, + "learning_rate": 0.0006017810539495224, + "loss": 0.82175756, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2348, + "time_per_iteration": 2.9487318992614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094541, + "balance_loss_mlp": 1.08284068, + "diversity_loss_mlp": 0.0, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07881291561071736, + "language_loss": 0.82607108, + "learning_rate": 0.0006014760149638547, + "loss": 0.83701646, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.11700439, + "routerloss_mlp": 0.0, + "step": 2349, + "time_per_iteration": 2.7228691577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.0852139, + "diversity_loss_mlp": 0.0, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.08019466042103662, + "language_loss": 0.88398969, + "learning_rate": 0.000601170936581704, + "loss": 0.8949548, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2350, + "time_per_iteration": 2.521714687347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07951522, + "diversity_loss_mlp": 0.0, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.08533615412567333, + "language_loss": 0.84897137, + "learning_rate": 0.0006008658189215121, + "loss": 0.85987866, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2351, + "time_per_iteration": 2.6506216526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07545722, + "diversity_loss_mlp": 0.0, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.09237808795246917, + "language_loss": 0.80232167, + "learning_rate": 0.0006005606621017366, + "loss": 0.81319243, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2352, + "time_per_iteration": 2.5878968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010807, + "balance_loss_mlp": 1.06907678, + "diversity_loss_mlp": 0.0, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.07057821380790058, + "language_loss": 0.80339801, + "learning_rate": 0.0006002554662408496, + "loss": 0.81420493, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2353, + "time_per_iteration": 2.883782386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.0691061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.0736680584084088, + "language_loss": 0.9135446, + "learning_rate": 0.0005999502314573388, + "loss": 0.9243511, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2354, + "time_per_iteration": 2.645484685897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_mlp": 1.09201527, + "diversity_loss_mlp": 0.0, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07036557956994945, + "language_loss": 0.86196381, + "learning_rate": 0.0005996449578697066, + "loss": 0.87299991, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2355, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906536, + "balance_loss_mlp": 1.57839537, + "diversity_loss_mlp": 0.20635399, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.031145483684461562, + "language_loss": 0.81619978, + "learning_rate": 0.0005993396455964709, + "loss": 0.82526517, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01416124, + "step": 2356, + "time_per_iteration": 2.7277767658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.14805746, + "diversity_loss_mlp": 0.0, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07904312092760724, + "language_loss": 0.81657517, + "learning_rate": 0.0005990342947561647, + "loss": 0.82816887, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2357, + "time_per_iteration": 2.696223258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167894, + "balance_loss_mlp": 1.15651524, + "diversity_loss_mlp": 0.0, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.07381995676601517, + "language_loss": 0.78198934, + "learning_rate": 0.0005987289054673351, + "loss": 0.79366827, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2358, + "time_per_iteration": 2.602642059326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360078, + "balance_loss_mlp": 1.35392714, + "diversity_loss_mlp": 0.0, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.12195170998658643, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77935815, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2359, + "time_per_iteration": 4.880090713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146892, + "balance_loss_mlp": 1.13553107, + "diversity_loss_mlp": 0.0, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07250720881476776, + "language_loss": 0.91548061, + "learning_rate": 0.0005981180120183722, + "loss": 0.9269495, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2360, + "time_per_iteration": 2.680730104446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133243, + "balance_loss_mlp": 1.121382, + "diversity_loss_mlp": 0.0, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.055968167495159496, + "language_loss": 0.85338825, + "learning_rate": 0.0005978125080954089, + "loss": 0.8647207, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2361, + "time_per_iteration": 2.791376829147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124184, + "balance_loss_mlp": 1.11265099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.08653591933533131, + "language_loss": 0.77322888, + "learning_rate": 0.000597506966198262, + "loss": 0.7844708, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2362, + "time_per_iteration": 2.97446870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119088, + "balance_loss_mlp": 1.10733426, + "diversity_loss_mlp": 0.0, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.09240364374598002, + "language_loss": 0.84247041, + "learning_rate": 0.0005972013864455536, + "loss": 0.85366124, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2363, + "time_per_iteration": 2.577167510986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108786, + "balance_loss_mlp": 1.09771168, + "diversity_loss_mlp": 0.0, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.0787330127694287, + "language_loss": 0.8535012, + "learning_rate": 0.0005968957689559203, + "loss": 0.8645891, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2364, + "time_per_iteration": 2.7120981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105615, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07389843074969835, + "language_loss": 0.88484383, + "learning_rate": 0.0005965901138480131, + "loss": 0.89590001, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2365, + "time_per_iteration": 2.578874349594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110202, + "balance_loss_mlp": 1.09081471, + "diversity_loss_mlp": 0.0, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.06426783448513047, + "language_loss": 0.87068385, + "learning_rate": 0.0005962844212404982, + "loss": 0.88170409, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.11206055, + "routerloss_mlp": 0.0, + "step": 2366, + "time_per_iteration": 2.6638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096105, + "balance_loss_mlp": 1.08472049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05830156527831164, + "language_loss": 0.87147355, + "learning_rate": 0.0005959786912520558, + "loss": 0.88243461, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2367, + "time_per_iteration": 2.6142454147338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088371, + "balance_loss_mlp": 1.07726681, + "diversity_loss_mlp": 0.0, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.06261196085687584, + "language_loss": 0.83712542, + "learning_rate": 0.0005956729240013806, + "loss": 0.84800917, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2368, + "time_per_iteration": 2.786256790161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095858, + "balance_loss_mlp": 1.08447385, + "diversity_loss_mlp": 0.0, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06874460659515655, + "language_loss": 0.91648531, + "learning_rate": 0.0005953671196071824, + "loss": 0.92744386, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2369, + "time_per_iteration": 2.756943941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093695, + "balance_loss_mlp": 1.08220375, + "diversity_loss_mlp": 0.0, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.07258619671695062, + "language_loss": 0.80044961, + "learning_rate": 0.0005950612781881846, + "loss": 0.81138659, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2370, + "time_per_iteration": 2.6791019439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906758, + "balance_loss_mlp": 1.57760763, + "diversity_loss_mlp": 0.20680004, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.03266097765038979, + "language_loss": 0.76005763, + "learning_rate": 0.0005947553998631259, + "loss": 0.76912522, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01455403, + "step": 2371, + "time_per_iteration": 2.908493995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010769, + "balance_loss_mlp": 1.06543183, + "diversity_loss_mlp": 0.0, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.05564189265933484, + "language_loss": 0.79205543, + "learning_rate": 0.000594449484750758, + "loss": 0.80282438, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2372, + "time_per_iteration": 3.18151593208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072251, + "balance_loss_mlp": 1.06046152, + "diversity_loss_mlp": 0.0, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07444834598910231, + "language_loss": 0.83208215, + "learning_rate": 0.0005941435329698484, + "loss": 0.84280467, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2373, + "time_per_iteration": 2.6709630489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.05895281, + "diversity_loss_mlp": 0.0, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.06837725942446468, + "language_loss": 0.83204812, + "learning_rate": 0.0005938375446391778, + "loss": 0.84275293, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2374, + "time_per_iteration": 2.6943106651306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.06261396, + "diversity_loss_mlp": 0.0, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.0748623734907781, + "language_loss": 0.8912878, + "learning_rate": 0.0005935315198775415, + "loss": 0.90203297, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2375, + "time_per_iteration": 2.6303911209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.05491209, + "diversity_loss_mlp": 0.0, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06590971106227904, + "language_loss": 0.87093645, + "learning_rate": 0.0005932254588037486, + "loss": 0.88160467, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2376, + "time_per_iteration": 2.5003554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106434, + "balance_loss_mlp": 1.0520016, + "diversity_loss_mlp": 0.0, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07188519107297629, + "language_loss": 0.86239958, + "learning_rate": 0.000592919361536623, + "loss": 0.87304294, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2377, + "time_per_iteration": 2.6426758766174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106134, + "balance_loss_mlp": 1.04946113, + "diversity_loss_mlp": 0.0, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.06083573176815847, + "language_loss": 0.88679874, + "learning_rate": 0.0005926132281950017, + "loss": 0.89741206, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2378, + "time_per_iteration": 2.7510690689086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065869, + "balance_loss_mlp": 1.05310154, + "diversity_loss_mlp": 0.0, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07940360452878177, + "language_loss": 0.85365742, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431611, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.12774658, + "routerloss_mlp": 0.0, + "step": 2379, + "time_per_iteration": 2.7969985008239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066405, + "balance_loss_mlp": 1.05444837, + "diversity_loss_mlp": 0.0, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.06398281947580985, + "language_loss": 0.86384034, + "learning_rate": 0.0005920008537636931, + "loss": 0.87450439, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2380, + "time_per_iteration": 2.90964412689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.05391335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.05698304417859526, + "language_loss": 0.86739266, + "learning_rate": 0.0005916946129117504, + "loss": 0.87805718, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.12548828, + "routerloss_mlp": 0.0, + "step": 2381, + "time_per_iteration": 2.9013612270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.06223381, + "diversity_loss_mlp": 0.0, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07634094682432664, + "language_loss": 0.80304879, + "learning_rate": 0.0005913883364608017, + "loss": 0.81379426, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2382, + "time_per_iteration": 3.086503505706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_mlp": 1.07212973, + "diversity_loss_mlp": 0.0, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.06243795661807547, + "language_loss": 0.8841778, + "learning_rate": 0.0005910820245297542, + "loss": 0.89501894, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 2383, + "time_per_iteration": 2.8612842559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090258, + "balance_loss_mlp": 1.07756186, + "diversity_loss_mlp": 0.0, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.08243832238560393, + "language_loss": 0.80972016, + "learning_rate": 0.000590775677237529, + "loss": 0.82062268, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 2384, + "time_per_iteration": 2.731405735015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094631, + "balance_loss_mlp": 1.08257282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.07578687885193977, + "language_loss": 0.80532229, + "learning_rate": 0.0005904692947030601, + "loss": 0.81626856, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.1204834, + "routerloss_mlp": 0.0, + "step": 2385, + "time_per_iteration": 2.6176209449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106556, + "balance_loss_mlp": 1.09437895, + "diversity_loss_mlp": 0.0, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.08078833732724985, + "language_loss": 0.8953619, + "learning_rate": 0.0005901628770452963, + "loss": 0.90642744, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2386, + "time_per_iteration": 2.5513737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.10345697, + "diversity_loss_mlp": 0.0, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.09403156888929357, + "language_loss": 0.87502134, + "learning_rate": 0.000589856424383199, + "loss": 0.88617843, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2387, + "time_per_iteration": 2.599862813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111298, + "balance_loss_mlp": 1.10114813, + "diversity_loss_mlp": 0.0, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.08117329221401763, + "language_loss": 0.8309918, + "learning_rate": 0.000589549936835744, + "loss": 0.8421216, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2388, + "time_per_iteration": 2.914754867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101869, + "balance_loss_mlp": 1.0899775, + "diversity_loss_mlp": 0.0, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06559429512714879, + "language_loss": 0.79056096, + "learning_rate": 0.0005892434145219202, + "loss": 0.80157959, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2389, + "time_per_iteration": 2.6295268535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898813, + "balance_loss_mlp": 1.5620172, + "diversity_loss_mlp": 0.2081904, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.0365067866217014, + "language_loss": 0.82780147, + "learning_rate": 0.0005889368575607303, + "loss": 0.83678961, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01370906, + "step": 2390, + "time_per_iteration": 2.8635401725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089349, + "balance_loss_mlp": 1.07753515, + "diversity_loss_mlp": 0.0, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.056196182118315396, + "language_loss": 0.78421402, + "learning_rate": 0.00058863026607119, + "loss": 0.79510748, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2391, + "time_per_iteration": 3.0734708309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.08715332, + "diversity_loss_mlp": 0.0, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07079174515079527, + "language_loss": 0.795928, + "learning_rate": 0.0005883236401723287, + "loss": 0.80691886, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2392, + "time_per_iteration": 3.1697676181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.08348131, + "diversity_loss_mlp": 0.0, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08882239564338372, + "language_loss": 0.84418833, + "learning_rate": 0.0005880169799831893, + "loss": 0.85514069, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2393, + "time_per_iteration": 2.668509006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_mlp": 1.08327174, + "diversity_loss_mlp": 0.0, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.06874062850812142, + "language_loss": 0.81593782, + "learning_rate": 0.0005877102856228278, + "loss": 0.82688844, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2394, + "time_per_iteration": 2.862039566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099301, + "balance_loss_mlp": 1.08791018, + "diversity_loss_mlp": 0.0, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07005170830273995, + "language_loss": 0.84822053, + "learning_rate": 0.0005874035572103133, + "loss": 0.85921353, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2395, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092906, + "balance_loss_mlp": 1.08152771, + "diversity_loss_mlp": 0.0, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09691208121118819, + "language_loss": 0.82382149, + "learning_rate": 0.0005870967948647288, + "loss": 0.83475053, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2396, + "time_per_iteration": 2.8379006385803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259876, + "balance_loss_mlp": 1.25238955, + "diversity_loss_mlp": 0.0, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.08205623370138872, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75568175, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2397, + "time_per_iteration": 5.0380027294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912357, + "balance_loss_mlp": 1.5885272, + "diversity_loss_mlp": 0.20776251, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.030510515868204604, + "language_loss": 0.86040902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86953259, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0142122, + "step": 2398, + "time_per_iteration": 2.9795196056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099565, + "balance_loss_mlp": 1.08854449, + "diversity_loss_mlp": 0.0, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.07495608045078013, + "language_loss": 0.75224954, + "learning_rate": 0.0005861763054205754, + "loss": 0.76324517, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2399, + "time_per_iteration": 2.7307660579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908198, + "balance_loss_mlp": 1.58042729, + "diversity_loss_mlp": 0.20863593, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.03052990379504839, + "language_loss": 0.8056978, + "learning_rate": 0.0005858694085337976, + "loss": 0.81477976, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01366598, + "step": 2400, + "time_per_iteration": 2.8421711921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115275, + "balance_loss_mlp": 1.10424817, + "diversity_loss_mlp": 0.0, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08470381171074581, + "language_loss": 0.8355788, + "learning_rate": 0.0005855624783095589, + "loss": 0.84673154, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2401, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10386109, + "diversity_loss_mlp": 0.0, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.07139821582333657, + "language_loss": 0.85265267, + "learning_rate": 0.00058525551486702, + "loss": 0.86379993, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2402, + "time_per_iteration": 2.5159239768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119193, + "balance_loss_mlp": 1.10795164, + "diversity_loss_mlp": 0.0, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.08747389081307531, + "language_loss": 0.80850065, + "learning_rate": 0.0005849485183253548, + "loss": 0.81969261, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2403, + "time_per_iteration": 2.643031358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110943, + "balance_loss_mlp": 1.09971905, + "diversity_loss_mlp": 0.0, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06974006499463392, + "language_loss": 0.8764264, + "learning_rate": 0.0005846414888037501, + "loss": 0.88753581, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2404, + "time_per_iteration": 2.4847412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_mlp": 1.07962489, + "diversity_loss_mlp": 0.0, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.07303422211334305, + "language_loss": 0.82384312, + "learning_rate": 0.0005843344264214049, + "loss": 0.83475375, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.11444092, + "routerloss_mlp": 0.0, + "step": 2405, + "time_per_iteration": 2.7470028400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093931, + "balance_loss_mlp": 1.08265948, + "diversity_loss_mlp": 0.0, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.06660378994806349, + "language_loss": 0.84838545, + "learning_rate": 0.0005840273312975317, + "loss": 0.85932475, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2406, + "time_per_iteration": 2.834179162979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082019, + "balance_loss_mlp": 1.07018733, + "diversity_loss_mlp": 0.0, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.07201348711751891, + "language_loss": 0.89853442, + "learning_rate": 0.0005837202035513555, + "loss": 0.90935457, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2407, + "time_per_iteration": 2.578505277633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081302, + "balance_loss_mlp": 1.06933987, + "diversity_loss_mlp": 0.0, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06479654524201506, + "language_loss": 0.81299376, + "learning_rate": 0.0005834130433021136, + "loss": 0.82380676, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2408, + "time_per_iteration": 2.742830991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075359, + "balance_loss_mlp": 1.0631156, + "diversity_loss_mlp": 0.0, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.06628126289532602, + "language_loss": 0.73402894, + "learning_rate": 0.0005831058506690563, + "loss": 0.74478251, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2409, + "time_per_iteration": 2.6239566802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875374, + "balance_loss_mlp": 1.5126431, + "diversity_loss_mlp": 0.20975235, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.03030502692098504, + "language_loss": 0.86162984, + "learning_rate": 0.0005827986257714464, + "loss": 0.87038362, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01417591, + "step": 2410, + "time_per_iteration": 2.9302031993865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.05664992, + "diversity_loss_mlp": 0.0, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.07558638886093381, + "language_loss": 0.88803709, + "learning_rate": 0.0005824913687285591, + "loss": 0.89872897, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2411, + "time_per_iteration": 2.685814142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070655, + "balance_loss_mlp": 1.05821514, + "diversity_loss_mlp": 0.0, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.1080687232114875, + "language_loss": 0.81367224, + "learning_rate": 0.0005821840796596821, + "loss": 0.82437879, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2412, + "time_per_iteration": 2.6551058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.06099916, + "diversity_loss_mlp": 0.0, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.07026214254932567, + "language_loss": 0.80428362, + "learning_rate": 0.0005818767586841158, + "loss": 0.81501973, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 2413, + "time_per_iteration": 2.759437322616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.07259476, + "diversity_loss_mlp": 0.0, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.08627931539992734, + "language_loss": 0.86441922, + "learning_rate": 0.0005815694059211726, + "loss": 0.8752715, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 2414, + "time_per_iteration": 2.658977746963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171514, + "balance_loss_mlp": 1.16250181, + "diversity_loss_mlp": 0.0, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.047494824411654174, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82045138, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 2415, + "time_per_iteration": 4.799519777297974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145606, + "balance_loss_mlp": 1.13711834, + "diversity_loss_mlp": 0.0, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.043373387729815825, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78090668, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 2416, + "time_per_iteration": 4.990553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0087124, + "balance_loss_mlp": 1.50839305, + "diversity_loss_mlp": 0.20828754, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.030578892859867562, + "language_loss": 0.86378521, + "learning_rate": 0.0005806471581013931, + "loss": 0.87249762, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289999, + "step": 2417, + "time_per_iteration": 2.6900436878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122345, + "balance_loss_mlp": 1.11040044, + "diversity_loss_mlp": 0.0, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.07418438196536063, + "language_loss": 0.78360349, + "learning_rate": 0.0005803396793823146, + "loss": 0.79482698, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2418, + "time_per_iteration": 2.8027873039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113389, + "balance_loss_mlp": 1.12212396, + "diversity_loss_mlp": 0.0, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07660062238284089, + "language_loss": 0.85582161, + "learning_rate": 0.0005800321694726065, + "loss": 0.86716056, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2419, + "time_per_iteration": 4.293209075927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870744, + "balance_loss_mlp": 1.50698626, + "diversity_loss_mlp": 0.20827082, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.03270390918014964, + "language_loss": 0.86636543, + "learning_rate": 0.0005797246284916545, + "loss": 0.87507284, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01311516, + "step": 2420, + "time_per_iteration": 2.7184417247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112587, + "balance_loss_mlp": 1.1061976, + "diversity_loss_mlp": 0.0, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.04763479459010098, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78617769, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2421, + "time_per_iteration": 4.978823900222778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.1527952, + "diversity_loss_mlp": 0.0, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.08359324638355049, + "language_loss": 0.87635398, + "learning_rate": 0.0005791094537936233, + "loss": 0.8879956, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2422, + "time_per_iteration": 2.706270217895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145768, + "balance_loss_mlp": 1.1349256, + "diversity_loss_mlp": 0.0, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.07317342210777962, + "language_loss": 0.81790811, + "learning_rate": 0.0005788018203153762, + "loss": 0.82936579, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2423, + "time_per_iteration": 2.5965187549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114513, + "balance_loss_mlp": 1.13404965, + "diversity_loss_mlp": 0.0, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08308161607945047, + "language_loss": 0.85607517, + "learning_rate": 0.000578494156243549, + "loss": 0.86752647, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2424, + "time_per_iteration": 2.5783984661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124685, + "balance_loss_mlp": 1.1135745, + "diversity_loss_mlp": 0.0, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.06702614551613306, + "language_loss": 0.88852286, + "learning_rate": 0.0005781864616975878, + "loss": 0.89976966, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2425, + "time_per_iteration": 2.6615347862243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.09463954, + "diversity_loss_mlp": 0.0, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0790317604017366, + "language_loss": 0.84397781, + "learning_rate": 0.0005778787367969502, + "loss": 0.85503376, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2426, + "time_per_iteration": 2.5796711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095822, + "balance_loss_mlp": 1.08478928, + "diversity_loss_mlp": 0.0, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.062032004097500974, + "language_loss": 0.80925953, + "learning_rate": 0.0005775709816611053, + "loss": 0.82021779, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.11029053, + "routerloss_mlp": 0.0, + "step": 2427, + "time_per_iteration": 2.9491348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07454419, + "diversity_loss_mlp": 0.0, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0676389696771178, + "language_loss": 0.83549029, + "learning_rate": 0.0005772631964095346, + "loss": 0.8463425, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 2428, + "time_per_iteration": 2.6981353759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081501, + "balance_loss_mlp": 1.07072484, + "diversity_loss_mlp": 0.0, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.08126061261115217, + "language_loss": 0.8576231, + "learning_rate": 0.000576955381161731, + "loss": 0.86843812, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2429, + "time_per_iteration": 2.6633517742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.06313229, + "diversity_loss_mlp": 0.0, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.08275287351868318, + "language_loss": 0.86212349, + "learning_rate": 0.0005766475360371985, + "loss": 0.87286699, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2430, + "time_per_iteration": 2.5904853343963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.06205034, + "diversity_loss_mlp": 0.0, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0860704645170746, + "language_loss": 0.84563982, + "learning_rate": 0.0005763396611554536, + "loss": 0.85636878, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2431, + "time_per_iteration": 2.6467607021331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.0607698, + "diversity_loss_mlp": 0.0, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.08998246562287979, + "language_loss": 0.80544329, + "learning_rate": 0.0005760317566360237, + "loss": 0.81615859, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2432, + "time_per_iteration": 3.006641387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075816, + "balance_loss_mlp": 1.0648669, + "diversity_loss_mlp": 0.0, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.07509845156715887, + "language_loss": 0.84929144, + "learning_rate": 0.000575723822598448, + "loss": 0.86004961, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2433, + "time_per_iteration": 2.764425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067328, + "balance_loss_mlp": 1.0558188, + "diversity_loss_mlp": 0.0, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.06651895210271294, + "language_loss": 0.8167448, + "learning_rate": 0.0005754158591622773, + "loss": 0.82741809, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2434, + "time_per_iteration": 2.9786107540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075165, + "balance_loss_mlp": 1.06366098, + "diversity_loss_mlp": 0.0, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07251033111677281, + "language_loss": 0.82255369, + "learning_rate": 0.0005751078664470732, + "loss": 0.83330536, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2435, + "time_per_iteration": 2.5367684364318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079887, + "balance_loss_mlp": 1.06816268, + "diversity_loss_mlp": 0.0, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.07721942828462902, + "language_loss": 0.85977614, + "learning_rate": 0.0005747998445724094, + "loss": 0.87057501, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2436, + "time_per_iteration": 2.636200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_mlp": 1.07313251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.07122055500535385, + "language_loss": 0.89087129, + "learning_rate": 0.0005744917936578707, + "loss": 0.90172094, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2437, + "time_per_iteration": 2.7820210456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07790279, + "diversity_loss_mlp": 0.0, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.0674848593159629, + "language_loss": 0.84104413, + "learning_rate": 0.0005741837138230526, + "loss": 0.85194385, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2438, + "time_per_iteration": 2.7324602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091997, + "balance_loss_mlp": 1.07981968, + "diversity_loss_mlp": 0.0, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.08534673561441382, + "language_loss": 0.86345065, + "learning_rate": 0.0005738756051875627, + "loss": 0.87437063, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2439, + "time_per_iteration": 3.0705649852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098053, + "balance_loss_mlp": 1.08564377, + "diversity_loss_mlp": 0.0, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.06467123496854205, + "language_loss": 0.83114249, + "learning_rate": 0.0005735674678710192, + "loss": 0.84212297, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2440, + "time_per_iteration": 2.6645498275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.07644403, + "diversity_loss_mlp": 0.0, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.09155388913703945, + "language_loss": 0.81178355, + "learning_rate": 0.0005732593019930517, + "loss": 0.82267421, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2441, + "time_per_iteration": 2.892775774002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084176, + "balance_loss_mlp": 1.07203436, + "diversity_loss_mlp": 0.0, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.07090754106091501, + "language_loss": 0.87927258, + "learning_rate": 0.0005729511076733008, + "loss": 0.89011431, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2442, + "time_per_iteration": 2.629671096801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080039, + "balance_loss_mlp": 1.06766534, + "diversity_loss_mlp": 0.0, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.0886658808398658, + "language_loss": 0.85080904, + "learning_rate": 0.000572642885031418, + "loss": 0.86160946, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.1237793, + "routerloss_mlp": 0.0, + "step": 2443, + "time_per_iteration": 2.858177900314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083351, + "balance_loss_mlp": 1.07077432, + "diversity_loss_mlp": 0.0, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.06516149518751314, + "language_loss": 0.80735445, + "learning_rate": 0.0005723346341870662, + "loss": 0.81818795, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.12573242, + "routerloss_mlp": 0.0, + "step": 2444, + "time_per_iteration": 2.7146968841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_mlp": 1.07161689, + "diversity_loss_mlp": 0.0, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.08093347646647668, + "language_loss": 0.86360067, + "learning_rate": 0.0005720263552599188, + "loss": 0.87444162, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2445, + "time_per_iteration": 2.5240447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077035, + "balance_loss_mlp": 1.06469131, + "diversity_loss_mlp": 0.0, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.10031003663616385, + "language_loss": 0.80052316, + "learning_rate": 0.0005717180483696604, + "loss": 0.81129348, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.12347412, + "routerloss_mlp": 0.0, + "step": 2446, + "time_per_iteration": 2.8576042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.06456566, + "diversity_loss_mlp": 0.0, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.06704052343949889, + "language_loss": 0.82989585, + "learning_rate": 0.0005714097136359862, + "loss": 0.84066319, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2447, + "time_per_iteration": 2.624566078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841696, + "balance_loss_mlp": 1.45028305, + "diversity_loss_mlp": 0.205522, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.027205551471082397, + "language_loss": 0.86918223, + "learning_rate": 0.0005711013511786027, + "loss": 0.87759912, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01379322, + "step": 2448, + "time_per_iteration": 2.797086238861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106901, + "balance_loss_mlp": 1.05689788, + "diversity_loss_mlp": 0.0, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06342125158561994, + "language_loss": 0.83811176, + "learning_rate": 0.0005707929611172263, + "loss": 0.84880185, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2449, + "time_per_iteration": 2.731825351715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.05951726, + "diversity_loss_mlp": 0.0, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.09170207604049842, + "language_loss": 0.84256124, + "learning_rate": 0.000570484543571585, + "loss": 0.85327655, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2450, + "time_per_iteration": 2.5735461711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_mlp": 1.05268502, + "diversity_loss_mlp": 0.0, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.08479509676509417, + "language_loss": 0.82936448, + "learning_rate": 0.0005701760986614171, + "loss": 0.84001064, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2451, + "time_per_iteration": 2.537297248840332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071081, + "balance_loss_mlp": 1.0591718, + "diversity_loss_mlp": 0.0, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.059658494784791405, + "language_loss": 0.8734417, + "learning_rate": 0.0005698676265064714, + "loss": 0.88415247, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2452, + "time_per_iteration": 2.5586979389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.06525099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.0707454592736124, + "language_loss": 0.89208829, + "learning_rate": 0.0005695591272265074, + "loss": 0.90285689, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2453, + "time_per_iteration": 2.527719736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088311, + "balance_loss_mlp": 1.07617581, + "diversity_loss_mlp": 0.0, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.07134640406799209, + "language_loss": 0.81947398, + "learning_rate": 0.0005692506009412954, + "loss": 0.83035707, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2454, + "time_per_iteration": 2.6558947563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0064123, + "balance_loss_mlp": 1.11988485, + "diversity_loss_mlp": 0.13842735, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.002527541257966033, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78192496, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01207405, + "step": 2455, + "time_per_iteration": 5.005730628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.07716715, + "diversity_loss_mlp": 0.0, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07179176619920838, + "language_loss": 0.89308333, + "learning_rate": 0.0005686334678342593, + "loss": 0.90397304, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2456, + "time_per_iteration": 2.8779940605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094143, + "balance_loss_mlp": 1.08280611, + "diversity_loss_mlp": 0.0, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08187467616753978, + "language_loss": 0.81664062, + "learning_rate": 0.0005683248612520274, + "loss": 0.82758206, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2457, + "time_per_iteration": 3.0844156742095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087436, + "balance_loss_mlp": 1.07605195, + "diversity_loss_mlp": 0.0, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.08330432962991885, + "language_loss": 0.83940041, + "learning_rate": 0.0005680162281437321, + "loss": 0.85027468, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2458, + "time_per_iteration": 2.886364221572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_mlp": 1.07263231, + "diversity_loss_mlp": 0.0, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.06607837126207569, + "language_loss": 0.84340584, + "learning_rate": 0.000567707568629195, + "loss": 0.8542465, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2459, + "time_per_iteration": 2.7153613567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.0712074, + "diversity_loss_mlp": 0.0, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.0662532862091719, + "language_loss": 0.82247961, + "learning_rate": 0.0005673988828282486, + "loss": 0.8333075, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2460, + "time_per_iteration": 2.6740705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06760526, + "diversity_loss_mlp": 0.0, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05997115702153478, + "language_loss": 0.81122911, + "learning_rate": 0.0005670901708607352, + "loss": 0.82202172, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2461, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077887, + "balance_loss_mlp": 1.0661211, + "diversity_loss_mlp": 0.0, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.12722631062247966, + "language_loss": 0.83784962, + "learning_rate": 0.0005667814328465076, + "loss": 0.84862852, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2462, + "time_per_iteration": 2.62223744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_mlp": 1.06031179, + "diversity_loss_mlp": 0.0, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.10920156375550993, + "language_loss": 0.82163846, + "learning_rate": 0.0005664726689054285, + "loss": 0.83235747, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2463, + "time_per_iteration": 2.474776029586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.06096554, + "diversity_loss_mlp": 0.0, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.07990467081118383, + "language_loss": 0.80772603, + "learning_rate": 0.0005661638791573704, + "loss": 0.81845051, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2464, + "time_per_iteration": 2.699165105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073096, + "balance_loss_mlp": 1.06145513, + "diversity_loss_mlp": 0.0, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.06593248790897067, + "language_loss": 0.86978662, + "learning_rate": 0.0005658550637222164, + "loss": 0.8805176, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2465, + "time_per_iteration": 2.6154093742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.0586381, + "diversity_loss_mlp": 0.0, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.06422453310815268, + "language_loss": 0.82103038, + "learning_rate": 0.0005655462227198592, + "loss": 0.83173257, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2466, + "time_per_iteration": 2.888040065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068955, + "balance_loss_mlp": 1.05703366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.07464863741428074, + "language_loss": 0.84426093, + "learning_rate": 0.0005652373562702016, + "loss": 0.85495043, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2467, + "time_per_iteration": 2.6240220069885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071196, + "balance_loss_mlp": 1.05926943, + "diversity_loss_mlp": 0.0, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06778780294468974, + "language_loss": 0.88405621, + "learning_rate": 0.000564928464493156, + "loss": 0.89476824, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2468, + "time_per_iteration": 2.598493814468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068768, + "balance_loss_mlp": 1.05676329, + "diversity_loss_mlp": 0.0, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.06443301027733518, + "language_loss": 0.81735635, + "learning_rate": 0.000564619547508645, + "loss": 0.82804406, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2469, + "time_per_iteration": 4.510512828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_mlp": 1.05816698, + "diversity_loss_mlp": 0.0, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.0879456232971056, + "language_loss": 0.82882106, + "learning_rate": 0.0005643106054366008, + "loss": 0.83952397, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2470, + "time_per_iteration": 2.5648152828216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.06276536, + "diversity_loss_mlp": 0.0, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.06194770014341408, + "language_loss": 0.79193991, + "learning_rate": 0.000564001638396965, + "loss": 0.8026849, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2471, + "time_per_iteration": 2.7267987728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073205, + "balance_loss_mlp": 1.06152296, + "diversity_loss_mlp": 0.0, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.06505306942508977, + "language_loss": 0.82164901, + "learning_rate": 0.0005636926465096897, + "loss": 0.83238107, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2472, + "time_per_iteration": 3.035590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078551, + "balance_loss_mlp": 1.06670165, + "diversity_loss_mlp": 0.0, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08684318660371242, + "language_loss": 0.8723672, + "learning_rate": 0.0005633836298947363, + "loss": 0.88315272, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2473, + "time_per_iteration": 4.002026796340942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091096, + "balance_loss_mlp": 1.07912695, + "diversity_loss_mlp": 0.0, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.0706680414575132, + "language_loss": 0.70566314, + "learning_rate": 0.000563074588672075, + "loss": 0.71657413, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2474, + "time_per_iteration": 2.6985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089769, + "balance_loss_mlp": 1.07802129, + "diversity_loss_mlp": 0.0, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06282750442858279, + "language_loss": 0.85378051, + "learning_rate": 0.0005627655229616868, + "loss": 0.86467826, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2475, + "time_per_iteration": 2.7580935955047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091111, + "balance_loss_mlp": 1.07941031, + "diversity_loss_mlp": 0.0, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07002888905047219, + "language_loss": 0.90058106, + "learning_rate": 0.0005624564328835616, + "loss": 0.91149217, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2476, + "time_per_iteration": 2.789257764816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108666, + "balance_loss_mlp": 1.07509637, + "diversity_loss_mlp": 0.0, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06042863191219761, + "language_loss": 0.84203571, + "learning_rate": 0.0005621473185576986, + "loss": 0.85290229, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2477, + "time_per_iteration": 2.724280834197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_mlp": 1.07846594, + "diversity_loss_mlp": 0.0, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.07203405271885309, + "language_loss": 0.87555075, + "learning_rate": 0.0005618381801041068, + "loss": 0.88644993, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2478, + "time_per_iteration": 2.6800026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085708, + "balance_loss_mlp": 1.0738883, + "diversity_loss_mlp": 0.0, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.08495018756940642, + "language_loss": 0.83006722, + "learning_rate": 0.0005615290176428044, + "loss": 0.84092432, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2479, + "time_per_iteration": 2.6456432342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078377, + "balance_loss_mlp": 1.06658673, + "diversity_loss_mlp": 0.0, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07371403414772894, + "language_loss": 0.84979588, + "learning_rate": 0.0005612198312938187, + "loss": 0.86057961, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2480, + "time_per_iteration": 2.7325923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_mlp": 1.0737772, + "diversity_loss_mlp": 0.0, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.05926830515799366, + "language_loss": 0.79493093, + "learning_rate": 0.0005609106211771868, + "loss": 0.80578327, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2481, + "time_per_iteration": 2.8374931812286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108305, + "balance_loss_mlp": 1.07103384, + "diversity_loss_mlp": 0.0, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.06643858588339867, + "language_loss": 0.88938701, + "learning_rate": 0.0005606013874129543, + "loss": 0.90021759, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2482, + "time_per_iteration": 2.7547929286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081649, + "balance_loss_mlp": 1.07017505, + "diversity_loss_mlp": 0.0, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06416127972697647, + "language_loss": 0.80410159, + "learning_rate": 0.0005602921301211768, + "loss": 0.81491804, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2483, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080053, + "balance_loss_mlp": 1.06850159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.07652865967226291, + "language_loss": 0.8209163, + "learning_rate": 0.0005599828494219185, + "loss": 0.83171678, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.11541748, + "routerloss_mlp": 0.0, + "step": 2484, + "time_per_iteration": 2.5415024757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.05903542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.07721505579443601, + "language_loss": 0.89162952, + "learning_rate": 0.0005596735454352527, + "loss": 0.90233779, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2485, + "time_per_iteration": 2.8591346740722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077742, + "balance_loss_mlp": 1.06591046, + "diversity_loss_mlp": 0.0, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07819028279068943, + "language_loss": 0.85696715, + "learning_rate": 0.0005593642182812619, + "loss": 0.86774457, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2486, + "time_per_iteration": 2.679927349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.06575358, + "diversity_loss_mlp": 0.0, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.0859238614993436, + "language_loss": 0.83753216, + "learning_rate": 0.0005590548680800378, + "loss": 0.84830678, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2487, + "time_per_iteration": 3.0984909534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.05950415, + "diversity_loss_mlp": 0.0, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.06795851613398404, + "language_loss": 0.76434267, + "learning_rate": 0.0005587454949516804, + "loss": 0.77505481, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2488, + "time_per_iteration": 2.692324161529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.06507468, + "diversity_loss_mlp": 0.0, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.06921637005003253, + "language_loss": 0.8785038, + "learning_rate": 0.0005584360990162993, + "loss": 0.88927084, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2489, + "time_per_iteration": 2.646521806716919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077817, + "balance_loss_mlp": 1.06614649, + "diversity_loss_mlp": 0.0, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06386300972416134, + "language_loss": 0.85713631, + "learning_rate": 0.0005581266803940124, + "loss": 0.86791456, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2490, + "time_per_iteration": 2.735152244567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.05925143, + "diversity_loss_mlp": 0.0, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0718717211843218, + "language_loss": 0.87536263, + "learning_rate": 0.0005578172392049471, + "loss": 0.88607073, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2491, + "time_per_iteration": 2.7718377113342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00892921, + "balance_loss_mlp": 1.54530287, + "diversity_loss_mlp": 0.21191472, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.033555176901221506, + "language_loss": 0.84551859, + "learning_rate": 0.0005575077755692386, + "loss": 0.85444778, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01431197, + "step": 2492, + "time_per_iteration": 2.81888747215271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05893993, + "diversity_loss_mlp": 0.0, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.054684262853474656, + "language_loss": 0.86001486, + "learning_rate": 0.0005571982896070316, + "loss": 0.8707189, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2493, + "time_per_iteration": 2.655311346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_mlp": 1.07248712, + "diversity_loss_mlp": 0.0, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.07545203546694841, + "language_loss": 0.89854079, + "learning_rate": 0.0005568887814384792, + "loss": 0.90938115, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2494, + "time_per_iteration": 2.5930681228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_mlp": 1.07098675, + "diversity_loss_mlp": 0.0, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07194257940045806, + "language_loss": 0.87281573, + "learning_rate": 0.000556579251183743, + "loss": 0.88364077, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2495, + "time_per_iteration": 2.6386003494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076942, + "balance_loss_mlp": 1.06520605, + "diversity_loss_mlp": 0.0, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.0750590648958695, + "language_loss": 0.80158448, + "learning_rate": 0.0005562696989629936, + "loss": 0.81235385, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.11737061, + "routerloss_mlp": 0.0, + "step": 2496, + "time_per_iteration": 2.7050864696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880705, + "balance_loss_mlp": 1.52288473, + "diversity_loss_mlp": 0.21003026, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.02916103721032611, + "language_loss": 0.82606125, + "learning_rate": 0.0005559601248964095, + "loss": 0.83486831, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424794, + "step": 2497, + "time_per_iteration": 2.6473939418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_mlp": 1.0741564, + "diversity_loss_mlp": 0.0, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.07410871061403823, + "language_loss": 0.85882998, + "learning_rate": 0.0005556505291041783, + "loss": 0.86968333, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.11175537, + "routerloss_mlp": 0.0, + "step": 2498, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105898, + "balance_loss_mlp": 1.09428692, + "diversity_loss_mlp": 0.0, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.06465509842390993, + "language_loss": 0.84413946, + "learning_rate": 0.0005553409117064954, + "loss": 0.8551985, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2499, + "time_per_iteration": 2.880300521850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00859857, + "balance_loss_mlp": 1.48415303, + "diversity_loss_mlp": 0.20870377, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.02869897963967695, + "language_loss": 0.84937358, + "learning_rate": 0.0005550312728235654, + "loss": 0.85797209, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01342856, + "step": 2500, + "time_per_iteration": 2.7199203968048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_mlp": 1.08251953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07331859457791397, + "language_loss": 0.83879191, + "learning_rate": 0.0005547216125756003, + "loss": 0.84973377, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2501, + "time_per_iteration": 2.732786178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098928, + "balance_loss_mlp": 1.08708501, + "diversity_loss_mlp": 0.0, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07387575947985975, + "language_loss": 0.82064617, + "learning_rate": 0.0005544119310828211, + "loss": 0.83163536, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2502, + "time_per_iteration": 3.1029446125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_mlp": 1.08865714, + "diversity_loss_mlp": 0.0, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.06596898477591598, + "language_loss": 0.84657413, + "learning_rate": 0.0005541022284654568, + "loss": 0.8575809, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2503, + "time_per_iteration": 2.901026725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092163, + "balance_loss_mlp": 1.08015907, + "diversity_loss_mlp": 0.0, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.0759157238743441, + "language_loss": 0.83907866, + "learning_rate": 0.0005537925048437446, + "loss": 0.85000032, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2504, + "time_per_iteration": 2.6014060974121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00594545, + "balance_loss_mlp": 1.03097272, + "diversity_loss_mlp": 0.13453583, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.0017952613590721677, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76346016, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179097, + "step": 2505, + "time_per_iteration": 4.960138320922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867388, + "balance_loss_mlp": 1.49711311, + "diversity_loss_mlp": 0.20998067, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.029195885141922995, + "language_loss": 0.88189656, + "learning_rate": 0.0005531729950682664, + "loss": 0.8905704, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01384138, + "step": 2506, + "time_per_iteration": 3.056671142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_mlp": 1.07027662, + "diversity_loss_mlp": 0.0, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.09591114443507165, + "language_loss": 0.84746361, + "learning_rate": 0.000552863209155015, + "loss": 0.85828793, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2507, + "time_per_iteration": 2.473930835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00866012, + "balance_loss_mlp": 1.49284506, + "diversity_loss_mlp": 0.21081753, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.03047035716712285, + "language_loss": 0.82048851, + "learning_rate": 0.0005525534027184461, + "loss": 0.82914865, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01418037, + "step": 2508, + "time_per_iteration": 2.5708260536193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078028, + "balance_loss_mlp": 1.06624985, + "diversity_loss_mlp": 0.0, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.06261213728600334, + "language_loss": 0.83131289, + "learning_rate": 0.0005522435758788365, + "loss": 0.84209323, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2509, + "time_per_iteration": 2.7291650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00853572, + "balance_loss_mlp": 1.46908307, + "diversity_loss_mlp": 0.20966808, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.03495470447814039, + "language_loss": 0.80126894, + "learning_rate": 0.0005519337287564721, + "loss": 0.80980462, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01419635, + "step": 2510, + "time_per_iteration": 2.843698024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_mlp": 1.06536365, + "diversity_loss_mlp": 0.0, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07525780944119016, + "language_loss": 0.83495927, + "learning_rate": 0.000551623861471646, + "loss": 0.84572971, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2511, + "time_per_iteration": 2.7327091693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.1273582, + "diversity_loss_mlp": 0.0, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.052890092991212126, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79952717, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 2512, + "time_per_iteration": 4.820046901702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073764, + "balance_loss_mlp": 1.06182551, + "diversity_loss_mlp": 0.0, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.09417698665840035, + "language_loss": 0.8670119, + "learning_rate": 0.0005510040668958211, + "loss": 0.87774956, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2513, + "time_per_iteration": 2.579780101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051826, + "balance_loss_mlp": 1.04515004, + "diversity_loss_mlp": 0.0, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.02705432320804172, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78812408, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2514, + "time_per_iteration": 4.83507227897644 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106953, + "balance_loss_mlp": 1.05716157, + "diversity_loss_mlp": 0.0, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07432123735470587, + "language_loss": 0.83170015, + "learning_rate": 0.0005503841931138645, + "loss": 0.84239542, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2515, + "time_per_iteration": 2.6834895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071737, + "balance_loss_mlp": 1.05963731, + "diversity_loss_mlp": 0.0, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07510504832931036, + "language_loss": 0.81515384, + "learning_rate": 0.0005500742268214025, + "loss": 0.82587123, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2516, + "time_per_iteration": 2.494479179382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084077, + "balance_loss_mlp": 1.0715425, + "diversity_loss_mlp": 0.0, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06432693662792612, + "language_loss": 0.85142744, + "learning_rate": 0.0005497642410884014, + "loss": 0.86226821, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2517, + "time_per_iteration": 2.760425090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.06788325, + "diversity_loss_mlp": 0.0, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.06763953923030977, + "language_loss": 0.85120749, + "learning_rate": 0.0005494542360352085, + "loss": 0.86201251, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2518, + "time_per_iteration": 2.6524109840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108191, + "balance_loss_mlp": 1.06955993, + "diversity_loss_mlp": 0.0, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.06089591080825084, + "language_loss": 0.85741639, + "learning_rate": 0.0005491442117821783, + "loss": 0.86823547, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2519, + "time_per_iteration": 2.7461459636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079216, + "balance_loss_mlp": 1.06654429, + "diversity_loss_mlp": 0.0, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.07584750574127574, + "language_loss": 0.87494171, + "learning_rate": 0.0005488341684496732, + "loss": 0.88573384, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2520, + "time_per_iteration": 2.6621458530426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.06843615, + "diversity_loss_mlp": 0.0, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06605179609441998, + "language_loss": 0.9207437, + "learning_rate": 0.0005485241061580624, + "loss": 0.9315502, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2521, + "time_per_iteration": 2.772949457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.07741094, + "diversity_loss_mlp": 0.0, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.06556104217544546, + "language_loss": 0.8458938, + "learning_rate": 0.0005482140250277228, + "loss": 0.85679281, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2522, + "time_per_iteration": 2.978330135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847105, + "balance_loss_mlp": 1.45509815, + "diversity_loss_mlp": 0.21114388, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.03368619412239962, + "language_loss": 0.87090278, + "learning_rate": 0.0005479039251790387, + "loss": 0.87937379, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01398425, + "step": 2523, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00840008, + "balance_loss_mlp": 1.44148707, + "diversity_loss_mlp": 0.21069397, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.03188648694570784, + "language_loss": 0.84722733, + "learning_rate": 0.0005475938067324014, + "loss": 0.85562754, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0139178, + "step": 2524, + "time_per_iteration": 2.859184980392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106923, + "balance_loss_mlp": 1.09528267, + "diversity_loss_mlp": 0.0, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.06962736532334403, + "language_loss": 0.83518255, + "learning_rate": 0.0005472836698082098, + "loss": 0.84625173, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2525, + "time_per_iteration": 2.534783363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_mlp": 1.08923149, + "diversity_loss_mlp": 0.0, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.07423434170097615, + "language_loss": 0.84140873, + "learning_rate": 0.0005469735145268694, + "loss": 0.85241902, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2526, + "time_per_iteration": 2.7064108848571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090982, + "balance_loss_mlp": 1.07928169, + "diversity_loss_mlp": 0.0, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0731540325655248, + "language_loss": 0.81093931, + "learning_rate": 0.0005466633410087933, + "loss": 0.82184911, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2527, + "time_per_iteration": 2.682969570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_mlp": 1.07793164, + "diversity_loss_mlp": 0.0, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03711409557498352, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78346336, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.07568359, + "routerloss_mlp": 0.0, + "step": 2528, + "time_per_iteration": 4.962444067001343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_mlp": 1.07360601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.07791605184695856, + "language_loss": 0.88148236, + "learning_rate": 0.0005460429397441214, + "loss": 0.89233321, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2529, + "time_per_iteration": 2.5908102989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00835644, + "balance_loss_mlp": 1.43002903, + "diversity_loss_mlp": 0.21195745, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.03186279831907627, + "language_loss": 0.87013817, + "learning_rate": 0.0005457327122383866, + "loss": 0.87849462, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01465126, + "step": 2530, + "time_per_iteration": 2.656264543533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_mlp": 1.02939153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.02373673385224348, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75673413, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.07519531, + "routerloss_mlp": 0.0, + "step": 2531, + "time_per_iteration": 4.838496208190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_mlp": 1.08965194, + "diversity_loss_mlp": 0.0, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.06845758574896237, + "language_loss": 0.75823385, + "learning_rate": 0.0005451122040823244, + "loss": 0.76924324, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2532, + "time_per_iteration": 2.770751714706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099102, + "balance_loss_mlp": 1.08746696, + "diversity_loss_mlp": 0.0, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07387169787784394, + "language_loss": 0.77164292, + "learning_rate": 0.0005448019236728997, + "loss": 0.7826339, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2533, + "time_per_iteration": 2.8874497413635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837303, + "balance_loss_mlp": 1.43305767, + "diversity_loss_mlp": 0.21233971, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.03246629845535473, + "language_loss": 0.8471576, + "learning_rate": 0.0005444916258698255, + "loss": 0.85553062, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01460437, + "step": 2534, + "time_per_iteration": 2.623748540878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112009, + "balance_loss_mlp": 1.10867584, + "diversity_loss_mlp": 0.0, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06488105381348498, + "language_loss": 0.86077154, + "learning_rate": 0.0005441813107935704, + "loss": 0.87197244, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2535, + "time_per_iteration": 2.6705739498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124443, + "balance_loss_mlp": 1.11277819, + "diversity_loss_mlp": 0.0, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07112550287999594, + "language_loss": 0.86025345, + "learning_rate": 0.0005438709785646091, + "loss": 0.87149793, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2536, + "time_per_iteration": 2.5624749660491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120427, + "balance_loss_mlp": 1.10864902, + "diversity_loss_mlp": 0.0, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.08492074314505418, + "language_loss": 0.86885595, + "learning_rate": 0.0005435606293034234, + "loss": 0.8800602, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2537, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121847, + "balance_loss_mlp": 1.11035514, + "diversity_loss_mlp": 0.0, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.08214525409599778, + "language_loss": 0.84619427, + "learning_rate": 0.0005432502631305016, + "loss": 0.8574127, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2538, + "time_per_iteration": 2.700613021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.10190618, + "diversity_loss_mlp": 0.0, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06429037959601741, + "language_loss": 0.83193302, + "learning_rate": 0.0005429398801663386, + "loss": 0.84306723, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2539, + "time_per_iteration": 2.9839913845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097658, + "balance_loss_mlp": 1.08599913, + "diversity_loss_mlp": 0.0, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.12053819121868696, + "language_loss": 0.8290484, + "learning_rate": 0.0005426294805314355, + "loss": 0.84002495, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2540, + "time_per_iteration": 2.5029373168945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.08291781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.06245664696917761, + "language_loss": 0.80155998, + "learning_rate": 0.0005423190643463003, + "loss": 0.81250799, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2541, + "time_per_iteration": 2.949772357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093208, + "balance_loss_mlp": 1.08163261, + "diversity_loss_mlp": 0.0, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.07791209549750817, + "language_loss": 0.8281579, + "learning_rate": 0.0005420086317314473, + "loss": 0.83908999, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2542, + "time_per_iteration": 2.6383941173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088553, + "balance_loss_mlp": 1.0765729, + "diversity_loss_mlp": 0.0, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06362759827284906, + "language_loss": 0.81081557, + "learning_rate": 0.0005416981828073971, + "loss": 0.82170111, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.11981201, + "routerloss_mlp": 0.0, + "step": 2543, + "time_per_iteration": 2.8023576736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007156, + "balance_loss_mlp": 0.99990815, + "diversity_loss_mlp": 0.0, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.01938913368632236, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78122175, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 2544, + "time_per_iteration": 4.817458629608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.08184147, + "diversity_loss_mlp": 0.0, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.08678858450341921, + "language_loss": 0.84937072, + "learning_rate": 0.000541077236513819, + "loss": 0.86030519, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2545, + "time_per_iteration": 2.5271120071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089352, + "balance_loss_mlp": 1.07800293, + "diversity_loss_mlp": 0.0, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.07207098978073255, + "language_loss": 0.82449925, + "learning_rate": 0.0005407667393853638, + "loss": 0.83539271, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2546, + "time_per_iteration": 2.6385204792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.08250618, + "diversity_loss_mlp": 0.0, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06843607218978102, + "language_loss": 0.83673334, + "learning_rate": 0.0005404562264298569, + "loss": 0.84766948, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2547, + "time_per_iteration": 2.845250368118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102851, + "balance_loss_mlp": 1.09120405, + "diversity_loss_mlp": 0.0, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.06940893068641271, + "language_loss": 0.83999467, + "learning_rate": 0.0005401456977678498, + "loss": 0.8510232, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2548, + "time_per_iteration": 2.638720750808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099322, + "balance_loss_mlp": 1.08754444, + "diversity_loss_mlp": 0.0, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08453175850654031, + "language_loss": 0.77431965, + "learning_rate": 0.0005398351535199008, + "loss": 0.78531289, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2549, + "time_per_iteration": 3.064035415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_mlp": 1.09175706, + "diversity_loss_mlp": 0.0, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.07238427843662706, + "language_loss": 0.84189212, + "learning_rate": 0.0005395245938065735, + "loss": 0.85292226, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2550, + "time_per_iteration": 2.7746829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118468, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.08583684211433391, + "language_loss": 0.82631576, + "learning_rate": 0.0005392140187484379, + "loss": 0.83750039, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2551, + "time_per_iteration": 2.582195281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124142, + "balance_loss_mlp": 1.11273384, + "diversity_loss_mlp": 0.0, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.0682243054902728, + "language_loss": 0.89719319, + "learning_rate": 0.0005389034284660701, + "loss": 0.90843463, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2552, + "time_per_iteration": 2.824427366256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_mlp": 1.12022352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.08386347311462448, + "language_loss": 0.82537109, + "learning_rate": 0.000538592823080052, + "loss": 0.83668673, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2553, + "time_per_iteration": 3.24122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.11565781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.06967590045443849, + "language_loss": 0.84592807, + "learning_rate": 0.000538282202710971, + "loss": 0.85719973, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2554, + "time_per_iteration": 2.5753910541534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130476, + "balance_loss_mlp": 1.11918652, + "diversity_loss_mlp": 0.0, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.07442252581599826, + "language_loss": 0.82315147, + "learning_rate": 0.000537971567479421, + "loss": 0.83445626, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2555, + "time_per_iteration": 2.7354228496551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127557, + "balance_loss_mlp": 1.11596429, + "diversity_loss_mlp": 0.0, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.09076326784032986, + "language_loss": 0.88129175, + "learning_rate": 0.0005376609175060011, + "loss": 0.8925674, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2556, + "time_per_iteration": 2.6124610900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_mlp": 1.09465659, + "diversity_loss_mlp": 0.0, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07210041581715526, + "language_loss": 0.80779845, + "learning_rate": 0.0005373502529113162, + "loss": 0.81886077, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2557, + "time_per_iteration": 2.823993444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100884, + "balance_loss_mlp": 1.08888519, + "diversity_loss_mlp": 0.0, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.07460313059090624, + "language_loss": 0.81449521, + "learning_rate": 0.0005370395738159773, + "loss": 0.82550406, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2558, + "time_per_iteration": 2.6436777114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00834873, + "balance_loss_mlp": 1.42800272, + "diversity_loss_mlp": 0.21467975, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.03347414568603151, + "language_loss": 0.82822633, + "learning_rate": 0.0005367288803406003, + "loss": 0.83657515, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01353174, + "step": 2559, + "time_per_iteration": 2.662224531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083349, + "balance_loss_mlp": 1.07132101, + "diversity_loss_mlp": 0.0, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0788259825299616, + "language_loss": 0.818443, + "learning_rate": 0.0005364181726058073, + "loss": 0.82927656, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2560, + "time_per_iteration": 2.686300277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.06417727, + "diversity_loss_mlp": 0.0, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.07955060847799823, + "language_loss": 0.8272332, + "learning_rate": 0.0005361074507322261, + "loss": 0.83799613, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2561, + "time_per_iteration": 2.5809431076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.06138754, + "diversity_loss_mlp": 0.0, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07091460094801966, + "language_loss": 0.81425411, + "learning_rate": 0.000535796714840489, + "loss": 0.82498884, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2562, + "time_per_iteration": 2.6425187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073356, + "balance_loss_mlp": 1.06107163, + "diversity_loss_mlp": 0.0, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.10871355986071002, + "language_loss": 0.83800626, + "learning_rate": 0.0005354859650512348, + "loss": 0.84873986, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2563, + "time_per_iteration": 2.7957375049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074544, + "balance_loss_mlp": 1.06282604, + "diversity_loss_mlp": 0.0, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.0798917687203661, + "language_loss": 0.87428886, + "learning_rate": 0.0005351752014851074, + "loss": 0.88503432, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2564, + "time_per_iteration": 2.6205673217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_mlp": 1.07352281, + "diversity_loss_mlp": 0.0, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.06874397476353511, + "language_loss": 0.83621442, + "learning_rate": 0.0005348644242627553, + "loss": 0.84706771, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2565, + "time_per_iteration": 2.7460625171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010105, + "balance_loss_mlp": 1.00411022, + "diversity_loss_mlp": 0.0, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.013767653611631516, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76297128, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2566, + "time_per_iteration": 4.943475723266602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110567, + "balance_loss_mlp": 1.09899187, + "diversity_loss_mlp": 0.0, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.08759046492811678, + "language_loss": 0.81650245, + "learning_rate": 0.0005342428293320013, + "loss": 0.82760805, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2567, + "time_per_iteration": 2.7889564037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_mlp": 1.09142327, + "diversity_loss_mlp": 0.0, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07999691418133484, + "language_loss": 0.8344667, + "learning_rate": 0.0005339320118649238, + "loss": 0.84549326, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2568, + "time_per_iteration": 2.7774229049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.09715271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.07608170940546952, + "language_loss": 0.86422324, + "learning_rate": 0.000533621181224271, + "loss": 0.87530512, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2569, + "time_per_iteration": 2.7708005905151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08442283, + "diversity_loss_mlp": 0.0, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.06858054906862693, + "language_loss": 0.8138749, + "learning_rate": 0.0005333103375307182, + "loss": 0.82483125, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2570, + "time_per_iteration": 2.8407034873962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07972121, + "diversity_loss_mlp": 0.0, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06174009778797697, + "language_loss": 0.85711801, + "learning_rate": 0.0005329994809049451, + "loss": 0.86802495, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.10974121, + "routerloss_mlp": 0.0, + "step": 2571, + "time_per_iteration": 2.7500712871551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096363, + "balance_loss_mlp": 1.08508563, + "diversity_loss_mlp": 0.0, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.06855083904022342, + "language_loss": 0.88066995, + "learning_rate": 0.0005326886114676375, + "loss": 0.89163363, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2572, + "time_per_iteration": 2.730137825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.07269001, + "diversity_loss_mlp": 0.0, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06053914015656951, + "language_loss": 0.88364595, + "learning_rate": 0.0005323777293394854, + "loss": 0.89448464, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2573, + "time_per_iteration": 2.539825201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.07365584, + "diversity_loss_mlp": 0.0, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06797932871808014, + "language_loss": 0.81904709, + "learning_rate": 0.000532066834641184, + "loss": 0.8298943, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.11065674, + "routerloss_mlp": 0.0, + "step": 2574, + "time_per_iteration": 2.6663713455200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_mlp": 1.09271336, + "diversity_loss_mlp": 0.0, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07191084425213706, + "language_loss": 0.85331243, + "learning_rate": 0.0005317559274934334, + "loss": 0.86435068, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2575, + "time_per_iteration": 2.756410598754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097116, + "balance_loss_mlp": 1.08592236, + "diversity_loss_mlp": 0.0, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.08893709148941176, + "language_loss": 0.80365205, + "learning_rate": 0.0005314450080169382, + "loss": 0.81462318, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2576, + "time_per_iteration": 2.613163471221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092174, + "balance_loss_mlp": 1.0810523, + "diversity_loss_mlp": 0.0, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.10818754121519983, + "language_loss": 0.8082127, + "learning_rate": 0.0005311340763324083, + "loss": 0.81913447, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2577, + "time_per_iteration": 2.5670807361602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087439, + "balance_loss_mlp": 1.07612574, + "diversity_loss_mlp": 0.0, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.07097138632102568, + "language_loss": 0.82323599, + "learning_rate": 0.0005308231325605578, + "loss": 0.83411032, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2578, + "time_per_iteration": 2.6519079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085757, + "balance_loss_mlp": 1.07421172, + "diversity_loss_mlp": 0.0, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06601832089031445, + "language_loss": 0.76727217, + "learning_rate": 0.0005305121768221061, + "loss": 0.7781297, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2579, + "time_per_iteration": 3.1306209564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_mlp": 1.03489161, + "diversity_loss_mlp": 0.0, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.022004289450105873, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76079202, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2580, + "time_per_iteration": 4.8141255378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079045, + "balance_loss_mlp": 1.06767821, + "diversity_loss_mlp": 0.0, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06618835036619775, + "language_loss": 0.91614985, + "learning_rate": 0.0005298902299282984, + "loss": 0.92694032, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2581, + "time_per_iteration": 2.586012125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087579, + "balance_loss_mlp": 1.07617044, + "diversity_loss_mlp": 0.0, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.07143589820149647, + "language_loss": 0.84265745, + "learning_rate": 0.0005295792390144033, + "loss": 0.85353327, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2582, + "time_per_iteration": 2.704911708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096311, + "balance_loss_mlp": 1.08442605, + "diversity_loss_mlp": 0.0, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.07556433689349051, + "language_loss": 0.83576399, + "learning_rate": 0.0005292682366168294, + "loss": 0.84672707, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2583, + "time_per_iteration": 2.5530638694763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_mlp": 1.09309435, + "diversity_loss_mlp": 0.0, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06699014279274042, + "language_loss": 0.80089158, + "learning_rate": 0.0005289572228563181, + "loss": 0.81194162, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2584, + "time_per_iteration": 2.729093551635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100707, + "balance_loss_mlp": 1.08861935, + "diversity_loss_mlp": 0.0, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.0657007833960997, + "language_loss": 0.83234823, + "learning_rate": 0.000528646197853616, + "loss": 0.8433553, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2585, + "time_per_iteration": 2.727252721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113697, + "balance_loss_mlp": 1.10166335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.07376563164337009, + "language_loss": 0.85810697, + "learning_rate": 0.0005283351617294735, + "loss": 0.86924398, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2586, + "time_per_iteration": 2.945610761642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011716, + "balance_loss_mlp": 1.00470638, + "diversity_loss_mlp": 0.0, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.017193207514109847, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77648377, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.0703125, + "routerloss_mlp": 0.0, + "step": 2587, + "time_per_iteration": 5.038366079330444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.07597303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06591325697086226, + "language_loss": 0.86769819, + "learning_rate": 0.0005277130565998916, + "loss": 0.87858337, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2588, + "time_per_iteration": 2.7726681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_mlp": 1.07443595, + "diversity_loss_mlp": 0.0, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05822748641904789, + "language_loss": 0.81899714, + "learning_rate": 0.0005274019878359748, + "loss": 0.82986516, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 2589, + "time_per_iteration": 2.733985424041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.06275249, + "diversity_loss_mlp": 0.0, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.0736619230298454, + "language_loss": 0.87174684, + "learning_rate": 0.0005270909084336628, + "loss": 0.88249791, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2590, + "time_per_iteration": 2.648728370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075145, + "balance_loss_mlp": 1.06231809, + "diversity_loss_mlp": 0.0, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.07329601175103365, + "language_loss": 0.8877548, + "learning_rate": 0.0005267798185137276, + "loss": 0.89850616, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.12835693, + "routerloss_mlp": 0.0, + "step": 2591, + "time_per_iteration": 2.616903066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061242, + "balance_loss_mlp": 1.04852843, + "diversity_loss_mlp": 0.0, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.0712913700859702, + "language_loss": 0.89140213, + "learning_rate": 0.0005264687181969444, + "loss": 0.90201461, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 2592, + "time_per_iteration": 2.7121951580047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067017, + "balance_loss_mlp": 1.05430353, + "diversity_loss_mlp": 0.0, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07969645648170227, + "language_loss": 0.75208342, + "learning_rate": 0.0005261576076040937, + "loss": 0.76275361, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 2593, + "time_per_iteration": 3.248811721801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_mlp": 1.04746807, + "diversity_loss_mlp": 0.0, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07355463018535204, + "language_loss": 0.84396625, + "learning_rate": 0.0005258464868559591, + "loss": 0.85456228, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2594, + "time_per_iteration": 2.6535778045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_mlp": 1.0461601, + "diversity_loss_mlp": 0.0, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06735340586139127, + "language_loss": 0.88490266, + "learning_rate": 0.0005255353560733284, + "loss": 0.89548326, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2595, + "time_per_iteration": 2.5711045265197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_mlp": 1.03453541, + "diversity_loss_mlp": 0.0, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.025598241729826776, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619136, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 2596, + "time_per_iteration": 4.7992448806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_mlp": 1.05498767, + "diversity_loss_mlp": 0.0, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.07107233717475309, + "language_loss": 0.83179224, + "learning_rate": 0.0005249130648877492, + "loss": 0.84246206, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2597, + "time_per_iteration": 2.7089900970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068426, + "balance_loss_mlp": 1.05646324, + "diversity_loss_mlp": 0.0, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.08792128719199578, + "language_loss": 0.84945238, + "learning_rate": 0.0005246019047263953, + "loss": 0.86013663, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2598, + "time_per_iteration": 2.4586942195892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070932, + "balance_loss_mlp": 1.0594883, + "diversity_loss_mlp": 0.0, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.08031275074858332, + "language_loss": 0.82562858, + "learning_rate": 0.0005242907350137353, + "loss": 0.83633792, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2599, + "time_per_iteration": 2.547146797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075807, + "balance_loss_mlp": 1.06445217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.08690624784708721, + "language_loss": 0.79332286, + "learning_rate": 0.0005239795558705754, + "loss": 0.80408096, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2600, + "time_per_iteration": 2.5985541343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077027, + "balance_loss_mlp": 1.06555915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.06025548364908716, + "language_loss": 0.89517641, + "learning_rate": 0.0005236683674177264, + "loss": 0.90594667, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2601, + "time_per_iteration": 2.6358349323272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090245, + "balance_loss_mlp": 1.07874131, + "diversity_loss_mlp": 0.0, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06252214062087984, + "language_loss": 0.82497251, + "learning_rate": 0.0005233571697760021, + "loss": 0.83587497, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2602, + "time_per_iteration": 2.8629817962646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112229, + "balance_loss_mlp": 1.10087442, + "diversity_loss_mlp": 0.0, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06974132169475507, + "language_loss": 0.8293485, + "learning_rate": 0.0005230459630662203, + "loss": 0.84047079, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.11352539, + "routerloss_mlp": 0.0, + "step": 2603, + "time_per_iteration": 2.939380168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114631, + "balance_loss_mlp": 1.10359812, + "diversity_loss_mlp": 0.0, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.10511771954620508, + "language_loss": 0.81605637, + "learning_rate": 0.0005227347474092022, + "loss": 0.82720268, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2604, + "time_per_iteration": 2.7169747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.11197877, + "diversity_loss_mlp": 0.0, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.07495893748856379, + "language_loss": 0.83243322, + "learning_rate": 0.0005224235229257724, + "loss": 0.84366548, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2605, + "time_per_iteration": 2.6940438747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113092, + "balance_loss_mlp": 1.10178471, + "diversity_loss_mlp": 0.0, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.06884013858989874, + "language_loss": 0.86851203, + "learning_rate": 0.0005221122897367589, + "loss": 0.87964296, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2606, + "time_per_iteration": 2.800685405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109504, + "balance_loss_mlp": 1.09854841, + "diversity_loss_mlp": 0.0, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08142217271827161, + "language_loss": 0.81335354, + "learning_rate": 0.0005218010479629932, + "loss": 0.82444859, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2607, + "time_per_iteration": 2.657087564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098904, + "balance_loss_mlp": 1.08753133, + "diversity_loss_mlp": 0.0, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.08269023882009051, + "language_loss": 0.82140303, + "learning_rate": 0.0005214897977253102, + "loss": 0.83239204, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2608, + "time_per_iteration": 2.649846076965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_mlp": 1.07372093, + "diversity_loss_mlp": 0.0, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.061165709745894754, + "language_loss": 0.84233439, + "learning_rate": 0.0005211785391445473, + "loss": 0.8531844, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2609, + "time_per_iteration": 2.7179222106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087043, + "balance_loss_mlp": 1.07538986, + "diversity_loss_mlp": 0.0, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.06641391212047838, + "language_loss": 0.79080439, + "learning_rate": 0.0005208672723415467, + "loss": 0.80167478, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2610, + "time_per_iteration": 2.7928884029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.07359457, + "diversity_loss_mlp": 0.0, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.07063839016412009, + "language_loss": 0.79436052, + "learning_rate": 0.0005205559974371525, + "loss": 0.80521345, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2611, + "time_per_iteration": 2.75744366645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085649, + "balance_loss_mlp": 1.07412767, + "diversity_loss_mlp": 0.0, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06307258943078059, + "language_loss": 0.82345438, + "learning_rate": 0.0005202447145522123, + "loss": 0.83431089, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2612, + "time_per_iteration": 2.6847879886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084149, + "balance_loss_mlp": 1.07245421, + "diversity_loss_mlp": 0.0, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.060686478103186246, + "language_loss": 0.79358983, + "learning_rate": 0.0005199334238075769, + "loss": 0.80443138, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2613, + "time_per_iteration": 2.560041666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_mlp": 1.07277226, + "diversity_loss_mlp": 0.0, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.086387426867178, + "language_loss": 0.91963339, + "learning_rate": 0.0005196221253241, + "loss": 0.93048155, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2614, + "time_per_iteration": 2.6397578716278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107839, + "balance_loss_mlp": 1.06617713, + "diversity_loss_mlp": 0.0, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.09198716130289855, + "language_loss": 0.82890773, + "learning_rate": 0.0005193108192226383, + "loss": 0.83969164, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2615, + "time_per_iteration": 2.7370193004608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.06396329, + "diversity_loss_mlp": 0.0, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.08941342921082604, + "language_loss": 0.86907744, + "learning_rate": 0.000518999505624052, + "loss": 0.87983918, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2616, + "time_per_iteration": 2.733515739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067104, + "balance_loss_mlp": 1.05521274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05504525356098391, + "language_loss": 0.83447164, + "learning_rate": 0.000518688184649203, + "loss": 0.84514272, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2617, + "time_per_iteration": 2.816542625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075166, + "balance_loss_mlp": 1.06264269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07489503160460931, + "language_loss": 0.83596766, + "learning_rate": 0.0005183768564189577, + "loss": 0.84671938, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2618, + "time_per_iteration": 2.5781893730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081949, + "balance_loss_mlp": 1.07029045, + "diversity_loss_mlp": 0.0, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.0695581827230682, + "language_loss": 0.81485611, + "learning_rate": 0.0005180655210541838, + "loss": 0.82567555, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2619, + "time_per_iteration": 2.5642077922821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091231, + "balance_loss_mlp": 1.07894695, + "diversity_loss_mlp": 0.0, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.08072673001204132, + "language_loss": 0.83226323, + "learning_rate": 0.0005177541786757527, + "loss": 0.84317553, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2620, + "time_per_iteration": 2.7365450859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_mlp": 1.0882231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.0921594393427519, + "language_loss": 0.82626402, + "learning_rate": 0.000517442829404538, + "loss": 0.83727121, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2621, + "time_per_iteration": 3.053333044052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097629, + "balance_loss_mlp": 1.08534431, + "diversity_loss_mlp": 0.0, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.0844592365120011, + "language_loss": 0.87026393, + "learning_rate": 0.0005171314733614166, + "loss": 0.88124025, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2622, + "time_per_iteration": 2.8867554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099715, + "balance_loss_mlp": 1.08721614, + "diversity_loss_mlp": 0.0, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.07191738026805333, + "language_loss": 0.78457403, + "learning_rate": 0.0005168201106672671, + "loss": 0.79557121, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 2623, + "time_per_iteration": 2.7532849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_mlp": 1.07122076, + "diversity_loss_mlp": 0.0, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.06664161086213699, + "language_loss": 0.84876573, + "learning_rate": 0.0005165087414429717, + "loss": 0.85960108, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2624, + "time_per_iteration": 2.614475965499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.061566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.06712294156504883, + "language_loss": 0.83509946, + "learning_rate": 0.0005161973658094144, + "loss": 0.84583604, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2625, + "time_per_iteration": 2.6536033153533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875819, + "balance_loss_mlp": 1.51064336, + "diversity_loss_mlp": 0.21324398, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.02954045761884847, + "language_loss": 0.82599998, + "learning_rate": 0.000515885983887482, + "loss": 0.83475816, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01387555, + "step": 2626, + "time_per_iteration": 2.801612138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070563, + "balance_loss_mlp": 1.05863595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.07357396162877478, + "language_loss": 0.84283531, + "learning_rate": 0.0005155745957980636, + "loss": 0.8535409, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2627, + "time_per_iteration": 2.6239585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071483, + "balance_loss_mlp": 1.0589962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.06901961430938243, + "language_loss": 0.88532668, + "learning_rate": 0.000515263201662051, + "loss": 0.89604151, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2628, + "time_per_iteration": 2.65803861618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107038, + "balance_loss_mlp": 1.05840504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.06314416177701848, + "language_loss": 0.8250618, + "learning_rate": 0.0005149518016003378, + "loss": 0.8357656, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2629, + "time_per_iteration": 3.1646623611450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.04946709, + "diversity_loss_mlp": 0.0, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.1007750022567515, + "language_loss": 0.82337832, + "learning_rate": 0.0005146403957338206, + "loss": 0.83399695, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 2630, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.05236936, + "diversity_loss_mlp": 0.0, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06667308072604639, + "language_loss": 0.82288837, + "learning_rate": 0.0005143289841833975, + "loss": 0.83353263, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2631, + "time_per_iteration": 2.8448615074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.05643749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.09203997555384738, + "language_loss": 0.82179189, + "learning_rate": 0.0005140175670699696, + "loss": 0.83247638, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2632, + "time_per_iteration": 2.642666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067258, + "balance_loss_mlp": 1.05545044, + "diversity_loss_mlp": 0.0, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04894531982576629, + "language_loss": 0.82796603, + "learning_rate": 0.0005137061445144395, + "loss": 0.8386386, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2633, + "time_per_iteration": 2.8800737857818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076133, + "balance_loss_mlp": 1.06476033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06583044180155191, + "language_loss": 0.87074906, + "learning_rate": 0.000513394716637712, + "loss": 0.88151038, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2634, + "time_per_iteration": 2.7507505416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_mlp": 1.02921486, + "diversity_loss_mlp": 0.0, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03533282921310782, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80227697, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2635, + "time_per_iteration": 4.825605869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110028, + "balance_loss_mlp": 1.08881176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.07735545811428028, + "language_loss": 0.81068468, + "learning_rate": 0.0005127718454042958, + "loss": 0.82168746, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2636, + "time_per_iteration": 2.8241050243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_mlp": 1.08840299, + "diversity_loss_mlp": 0.0, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.08187506034762644, + "language_loss": 0.83836603, + "learning_rate": 0.0005124604022894269, + "loss": 0.8493644, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2637, + "time_per_iteration": 2.9366774559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019034, + "balance_loss_mlp": 1.01259708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.025963071476552062, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7820726, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 2638, + "time_per_iteration": 4.828620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092892, + "balance_loss_mlp": 1.08166814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.07837351333742608, + "language_loss": 0.83244252, + "learning_rate": 0.0005118375016679325, + "loss": 0.84337139, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2639, + "time_per_iteration": 2.801852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077953, + "balance_loss_mlp": 1.0666697, + "diversity_loss_mlp": 0.0, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.07879033409242599, + "language_loss": 0.80358827, + "learning_rate": 0.0005115260444031382, + "loss": 0.81436777, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2640, + "time_per_iteration": 2.596771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010253, + "balance_loss_mlp": 1.00422084, + "diversity_loss_mlp": 0.0, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011737851482073082, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79742074, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.06030273, + "routerloss_mlp": 0.0, + "step": 2641, + "time_per_iteration": 4.948842287063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075403, + "balance_loss_mlp": 1.06412029, + "diversity_loss_mlp": 0.0, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.08031663653823312, + "language_loss": 0.8740893, + "learning_rate": 0.0005109031165700483, + "loss": 0.88484335, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2642, + "time_per_iteration": 2.5833895206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_mlp": 1.04938459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.06372027514248847, + "language_loss": 0.83170295, + "learning_rate": 0.0005105916462435945, + "loss": 0.84231174, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2643, + "time_per_iteration": 2.841296911239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.05014455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0681709540800111, + "language_loss": 0.85266602, + "learning_rate": 0.0005102801718050989, + "loss": 0.86328042, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2644, + "time_per_iteration": 2.680905818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058539, + "balance_loss_mlp": 1.04714894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.07434027721258654, + "language_loss": 0.89314902, + "learning_rate": 0.0005099686933754867, + "loss": 0.90373439, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2645, + "time_per_iteration": 2.723043441772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_mlp": 1.05088663, + "diversity_loss_mlp": 0.0, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07256046334666034, + "language_loss": 0.8429243, + "learning_rate": 0.0005096572110756845, + "loss": 0.85354686, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2646, + "time_per_iteration": 2.6682143211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069615, + "balance_loss_mlp": 1.05801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06200075514200526, + "language_loss": 0.85445803, + "learning_rate": 0.0005093457250266205, + "loss": 0.86515421, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2647, + "time_per_iteration": 2.682891368865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.05816472, + "diversity_loss_mlp": 0.0, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1092618136395953, + "language_loss": 0.83279526, + "learning_rate": 0.000509034235349224, + "loss": 0.84349322, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2648, + "time_per_iteration": 2.7173004150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068823, + "balance_loss_mlp": 1.05756938, + "diversity_loss_mlp": 0.0, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07759183255272654, + "language_loss": 0.81290972, + "learning_rate": 0.0005087227421644266, + "loss": 0.82359791, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2649, + "time_per_iteration": 2.79217791557312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.05469334, + "diversity_loss_mlp": 0.0, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.07036579944312285, + "language_loss": 0.85978615, + "learning_rate": 0.0005084112455931602, + "loss": 0.87045121, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2650, + "time_per_iteration": 2.593323230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.06125915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.06673546987966349, + "language_loss": 0.85377133, + "learning_rate": 0.0005080997457563586, + "loss": 0.86449993, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2651, + "time_per_iteration": 2.5473101139068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074592, + "balance_loss_mlp": 1.06324303, + "diversity_loss_mlp": 0.0, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.07839929831674766, + "language_loss": 0.79146206, + "learning_rate": 0.0005077882427749569, + "loss": 0.80220807, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2652, + "time_per_iteration": 2.5378577709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081444, + "balance_loss_mlp": 1.07002354, + "diversity_loss_mlp": 0.0, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09222135648623411, + "language_loss": 0.84599656, + "learning_rate": 0.0005074767367698913, + "loss": 0.85681099, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2653, + "time_per_iteration": 2.7541823387145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.0749042, + "diversity_loss_mlp": 0.0, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.07250262260433718, + "language_loss": 0.82987714, + "learning_rate": 0.0005071652278620988, + "loss": 0.84074312, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2654, + "time_per_iteration": 3.0615251064300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089781, + "balance_loss_mlp": 1.07870018, + "diversity_loss_mlp": 0.0, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.07582936293709001, + "language_loss": 0.83328903, + "learning_rate": 0.0005068537161725186, + "loss": 0.84418684, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2655, + "time_per_iteration": 2.7840993404388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092544, + "balance_loss_mlp": 1.08139753, + "diversity_loss_mlp": 0.0, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07786356346883126, + "language_loss": 0.84288549, + "learning_rate": 0.0005065422018220893, + "loss": 0.85381097, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.1114502, + "routerloss_mlp": 0.0, + "step": 2656, + "time_per_iteration": 2.832575798034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.09118247, + "diversity_loss_mlp": 0.0, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.08194812181942494, + "language_loss": 0.80392313, + "learning_rate": 0.0005062306849317521, + "loss": 0.81494415, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2657, + "time_per_iteration": 2.794966220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.08891487, + "diversity_loss_mlp": 0.0, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.08210850574888065, + "language_loss": 0.83486134, + "learning_rate": 0.0005059191656224487, + "loss": 0.84586298, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2658, + "time_per_iteration": 2.744889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093286, + "balance_loss_mlp": 1.08238411, + "diversity_loss_mlp": 0.0, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.07321009008554179, + "language_loss": 0.88860798, + "learning_rate": 0.0005056076440151212, + "loss": 0.89954078, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2659, + "time_per_iteration": 2.6951825618743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113007, + "balance_loss_mlp": 1.12453902, + "diversity_loss_mlp": 0.0, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.07076104465295206, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77418184, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2660, + "time_per_iteration": 4.850585460662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.07051301, + "diversity_loss_mlp": 0.0, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06225287802871053, + "language_loss": 0.86966121, + "learning_rate": 0.0005049845943901691, + "loss": 0.88047487, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2661, + "time_per_iteration": 2.8342370986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079888, + "balance_loss_mlp": 1.0692786, + "diversity_loss_mlp": 0.0, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.058043198592839004, + "language_loss": 0.86637139, + "learning_rate": 0.0005046730666144338, + "loss": 0.87717032, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2662, + "time_per_iteration": 2.8066177368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078601, + "balance_loss_mlp": 1.06801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.058701328600128284, + "language_loss": 0.87834954, + "learning_rate": 0.0005043615370244532, + "loss": 0.88913548, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2663, + "time_per_iteration": 3.3716113567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_mlp": 1.04589903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02890820887526385, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79295814, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2664, + "time_per_iteration": 4.632098913192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074303, + "balance_loss_mlp": 1.0636878, + "diversity_loss_mlp": 0.0, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.05776678043634197, + "language_loss": 0.85301316, + "learning_rate": 0.0005037384728855425, + "loss": 0.86375624, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2665, + "time_per_iteration": 2.8025074005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077204, + "balance_loss_mlp": 1.06618285, + "diversity_loss_mlp": 0.0, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08001364709617295, + "language_loss": 0.84092522, + "learning_rate": 0.0005034269385785075, + "loss": 0.85169727, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2666, + "time_per_iteration": 2.6508989334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070249, + "balance_loss_mlp": 1.05929327, + "diversity_loss_mlp": 0.0, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.06550806602425656, + "language_loss": 0.849998, + "learning_rate": 0.0005031154029410168, + "loss": 0.86070049, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2667, + "time_per_iteration": 2.6072959899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062599, + "balance_loss_mlp": 1.05130351, + "diversity_loss_mlp": 0.0, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07261202613887993, + "language_loss": 0.86903906, + "learning_rate": 0.0005028038660940197, + "loss": 0.87966514, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2668, + "time_per_iteration": 2.5607664585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_mlp": 1.04923522, + "diversity_loss_mlp": 0.0, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.06521290367629204, + "language_loss": 0.84553415, + "learning_rate": 0.0005024923281584648, + "loss": 0.8561402, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2669, + "time_per_iteration": 2.623643159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066692, + "balance_loss_mlp": 1.05528402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.06549707374857121, + "language_loss": 0.82560658, + "learning_rate": 0.0005021807892553026, + "loss": 0.83627355, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2670, + "time_per_iteration": 2.699392318725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062757, + "balance_loss_mlp": 1.05140269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07318428846825417, + "language_loss": 0.84862608, + "learning_rate": 0.0005018692495054828, + "loss": 0.85925364, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2671, + "time_per_iteration": 2.7645046710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106912, + "balance_loss_mlp": 1.05812323, + "diversity_loss_mlp": 0.0, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06397327244364565, + "language_loss": 0.80696338, + "learning_rate": 0.0005015577090299561, + "loss": 0.81765461, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.11004639, + "routerloss_mlp": 0.0, + "step": 2672, + "time_per_iteration": 2.684048891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068328, + "balance_loss_mlp": 1.05731261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.06574977800170037, + "language_loss": 0.86744952, + "learning_rate": 0.0005012461679496729, + "loss": 0.87813282, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2673, + "time_per_iteration": 2.5885825157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.06613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.09032594792095527, + "language_loss": 0.87748468, + "learning_rate": 0.0005009346263855848, + "loss": 0.88825834, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2674, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092275, + "balance_loss_mlp": 1.08141518, + "diversity_loss_mlp": 0.0, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.06465969942237398, + "language_loss": 0.83699256, + "learning_rate": 0.0005006230844586422, + "loss": 0.84791529, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2675, + "time_per_iteration": 2.7912445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00882234, + "balance_loss_mlp": 1.52600026, + "diversity_loss_mlp": 0.21199086, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.0263651655655577, + "language_loss": 0.78895926, + "learning_rate": 0.0005003115422897968, + "loss": 0.79778159, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01323896, + "step": 2676, + "time_per_iteration": 2.8051552772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111408, + "balance_loss_mlp": 1.10282683, + "diversity_loss_mlp": 0.0, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.0741463219638638, + "language_loss": 0.87253916, + "learning_rate": 0.0005, + "loss": 0.88367999, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2677, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119404, + "balance_loss_mlp": 1.10841274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.08792863943872284, + "language_loss": 0.79283178, + "learning_rate": 0.0004996884577102033, + "loss": 0.80402583, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.10992432, + "routerloss_mlp": 0.0, + "step": 2678, + "time_per_iteration": 3.089707374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111646, + "balance_loss_mlp": 1.10545659, + "diversity_loss_mlp": 0.0, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.08112886088857633, + "language_loss": 0.84611261, + "learning_rate": 0.000499376915541358, + "loss": 0.85727721, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 2679, + "time_per_iteration": 2.7143540382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_mlp": 1.08910465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.16255458440641746, + "language_loss": 0.81113428, + "learning_rate": 0.0004990653736144155, + "loss": 0.82213122, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2680, + "time_per_iteration": 2.857952356338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_mlp": 1.07416916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06912387000686389, + "language_loss": 0.85820174, + "learning_rate": 0.0004987538320503271, + "loss": 0.86905092, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.10748291, + "routerloss_mlp": 0.0, + "step": 2681, + "time_per_iteration": 2.485462188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077984, + "balance_loss_mlp": 1.06715369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08121908376237164, + "language_loss": 0.83137929, + "learning_rate": 0.0004984422909700442, + "loss": 0.84215909, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2682, + "time_per_iteration": 2.7179505825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_mlp": 1.05784559, + "diversity_loss_mlp": 0.0, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.07829442771548371, + "language_loss": 0.83800036, + "learning_rate": 0.0004981307504945173, + "loss": 0.84868753, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2683, + "time_per_iteration": 2.71893048286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061815, + "balance_loss_mlp": 1.05075228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.08619577510477876, + "language_loss": 0.89448887, + "learning_rate": 0.0004978192107446976, + "loss": 0.90510702, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2684, + "time_per_iteration": 2.7385506629943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062179, + "balance_loss_mlp": 1.05111599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.08129158019501125, + "language_loss": 0.8740204, + "learning_rate": 0.0004975076718415353, + "loss": 0.88464212, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2685, + "time_per_iteration": 2.599379777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_mlp": 1.04478931, + "diversity_loss_mlp": 0.0, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.06772474949474022, + "language_loss": 0.90610582, + "learning_rate": 0.0004971961339059806, + "loss": 0.91666389, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.11016846, + "routerloss_mlp": 0.0, + "step": 2686, + "time_per_iteration": 2.498819589614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057473, + "balance_loss_mlp": 1.04611838, + "diversity_loss_mlp": 0.0, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06487308694775892, + "language_loss": 0.84021914, + "learning_rate": 0.0004968845970589832, + "loss": 0.85079384, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2687, + "time_per_iteration": 2.6814825534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.04982185, + "diversity_loss_mlp": 0.0, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.06911328459433905, + "language_loss": 0.8435297, + "learning_rate": 0.0004965730614214926, + "loss": 0.8541429, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2688, + "time_per_iteration": 2.6537294387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106762, + "balance_loss_mlp": 1.05618167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07039148040030412, + "language_loss": 0.85285878, + "learning_rate": 0.0004962615271144576, + "loss": 0.86353499, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2689, + "time_per_iteration": 2.50710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.05325246, + "diversity_loss_mlp": 0.0, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.0770213433091723, + "language_loss": 0.82680881, + "learning_rate": 0.0004959499942588264, + "loss": 0.83745599, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2690, + "time_per_iteration": 2.892293930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_mlp": 1.04297149, + "diversity_loss_mlp": 0.0, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03551055813206397, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79249913, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2691, + "time_per_iteration": 4.764665842056274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.05894208, + "diversity_loss_mlp": 0.0, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.08037192658361764, + "language_loss": 0.85416174, + "learning_rate": 0.0004953269333855661, + "loss": 0.86486399, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2692, + "time_per_iteration": 2.785511016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06407034, + "diversity_loss_mlp": 0.0, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.06114385406953633, + "language_loss": 0.84516799, + "learning_rate": 0.0004950154056098309, + "loss": 0.85592318, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.11437988, + "routerloss_mlp": 0.0, + "step": 2693, + "time_per_iteration": 2.683246374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.07183599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.08066804074186672, + "language_loss": 0.84078431, + "learning_rate": 0.0004947038797692867, + "loss": 0.85161769, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2694, + "time_per_iteration": 2.8312196731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872465, + "balance_loss_mlp": 1.50766385, + "diversity_loss_mlp": 0.2097543, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.031552182630998016, + "language_loss": 0.77636528, + "learning_rate": 0.0004943923559848789, + "loss": 0.78508997, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01375636, + "step": 2695, + "time_per_iteration": 2.8084189891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010865, + "balance_loss_mlp": 1.07534158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.055486891719670514, + "language_loss": 0.90695632, + "learning_rate": 0.0004940808343775515, + "loss": 0.91782129, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2696, + "time_per_iteration": 2.6868011951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874209, + "balance_loss_mlp": 1.50797677, + "diversity_loss_mlp": 0.21290711, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.034010170020107075, + "language_loss": 0.82213199, + "learning_rate": 0.0004937693150682479, + "loss": 0.83087409, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01376703, + "step": 2697, + "time_per_iteration": 2.5905513763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090314, + "balance_loss_mlp": 1.07915568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.06705206433038317, + "language_loss": 0.7658723, + "learning_rate": 0.0004934577981779107, + "loss": 0.77677542, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2698, + "time_per_iteration": 2.7049057483673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.07585335, + "diversity_loss_mlp": 0.0, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.061529133753451364, + "language_loss": 0.812904, + "learning_rate": 0.0004931462838274817, + "loss": 0.82377493, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2699, + "time_per_iteration": 2.8723175525665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089813, + "balance_loss_mlp": 1.07877994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.08487292742433496, + "language_loss": 0.84222901, + "learning_rate": 0.0004928347721379011, + "loss": 0.85312712, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2700, + "time_per_iteration": 2.639867067337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080545, + "balance_loss_mlp": 1.06974459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06134037245316137, + "language_loss": 0.82221866, + "learning_rate": 0.0004925232632301089, + "loss": 0.83302414, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2701, + "time_per_iteration": 2.622311592102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077123, + "balance_loss_mlp": 1.0660243, + "diversity_loss_mlp": 0.0, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.06337758152829237, + "language_loss": 0.79842103, + "learning_rate": 0.0004922117572250431, + "loss": 0.80919224, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 2702, + "time_per_iteration": 2.6980605125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070723, + "balance_loss_mlp": 1.05936241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.07398400160993446, + "language_loss": 0.80852163, + "learning_rate": 0.0004919002542436414, + "loss": 0.81922889, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2703, + "time_per_iteration": 2.8354647159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072853, + "balance_loss_mlp": 1.0619514, + "diversity_loss_mlp": 0.0, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.064542502306726, + "language_loss": 0.8126899, + "learning_rate": 0.0004915887544068399, + "loss": 0.8234185, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2704, + "time_per_iteration": 2.6693973541259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.05770195, + "diversity_loss_mlp": 0.0, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.06578360362401801, + "language_loss": 0.7856639, + "learning_rate": 0.0004912772578355736, + "loss": 0.79635167, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2705, + "time_per_iteration": 2.892735481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0611918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.07750798967783011, + "language_loss": 0.82549465, + "learning_rate": 0.000490965764650776, + "loss": 0.83621788, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2706, + "time_per_iteration": 2.8544106483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_mlp": 1.05984521, + "diversity_loss_mlp": 0.0, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06572065456776559, + "language_loss": 0.82828736, + "learning_rate": 0.0004906542749733798, + "loss": 0.83899713, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.11132812, + "routerloss_mlp": 0.0, + "step": 2707, + "time_per_iteration": 3.6044294834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.06353068, + "diversity_loss_mlp": 0.0, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.055629683487612144, + "language_loss": 0.85401118, + "learning_rate": 0.0004903427889243156, + "loss": 0.86475539, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.10894775, + "routerloss_mlp": 0.0, + "step": 2708, + "time_per_iteration": 2.830115795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075334, + "balance_loss_mlp": 1.06425905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.06692681375903406, + "language_loss": 0.85444081, + "learning_rate": 0.0004900313066245134, + "loss": 0.86519414, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2709, + "time_per_iteration": 2.6552441120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_mlp": 1.05745232, + "diversity_loss_mlp": 0.0, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.06855502771674758, + "language_loss": 0.81061214, + "learning_rate": 0.0004897198281949012, + "loss": 0.82129598, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2710, + "time_per_iteration": 2.645981550216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874972, + "balance_loss_mlp": 1.51124442, + "diversity_loss_mlp": 0.21021394, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.03577466895356274, + "language_loss": 0.78009295, + "learning_rate": 0.0004894083537564057, + "loss": 0.78884268, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424256, + "step": 2711, + "time_per_iteration": 2.746945858001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086804, + "balance_loss_mlp": 1.49602354, + "diversity_loss_mlp": 0.21089339, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.02967241377466632, + "language_loss": 0.80981171, + "learning_rate": 0.0004890968834299519, + "loss": 0.81849211, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01458106, + "step": 2712, + "time_per_iteration": 2.749049663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072348, + "balance_loss_mlp": 1.06096959, + "diversity_loss_mlp": 0.0, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06422523073894505, + "language_loss": 0.78739542, + "learning_rate": 0.0004887854173364633, + "loss": 0.79811883, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2713, + "time_per_iteration": 2.760077953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00862336, + "balance_loss_mlp": 1.48416615, + "diversity_loss_mlp": 0.2112534, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.02839704110509781, + "language_loss": 0.81564224, + "learning_rate": 0.0004884739555968617, + "loss": 0.8242656, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01462588, + "step": 2714, + "time_per_iteration": 2.902200698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_mlp": 1.03711605, + "diversity_loss_mlp": 0.0, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.025188943281148922, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8002032, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2715, + "time_per_iteration": 4.977273464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847492, + "balance_loss_mlp": 1.45660305, + "diversity_loss_mlp": 0.21012819, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.03573397478438407, + "language_loss": 0.86888605, + "learning_rate": 0.0004878510456629992, + "loss": 0.87736094, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01412619, + "step": 2716, + "time_per_iteration": 2.998455286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068588, + "balance_loss_mlp": 1.05767989, + "diversity_loss_mlp": 0.0, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.06765059094142209, + "language_loss": 0.85142076, + "learning_rate": 0.00048753959771057314, + "loss": 0.86210662, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.10925293, + "routerloss_mlp": 0.0, + "step": 2717, + "time_per_iteration": 2.6113662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065726, + "balance_loss_mlp": 1.05442464, + "diversity_loss_mlp": 0.0, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.08600503840688169, + "language_loss": 0.82445514, + "learning_rate": 0.0004872281545957044, + "loss": 0.83511233, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2718, + "time_per_iteration": 2.7617604732513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05911732, + "diversity_loss_mlp": 0.0, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.061040572409093316, + "language_loss": 0.86051857, + "learning_rate": 0.0004869167164393055, + "loss": 0.87122279, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.11303711, + "routerloss_mlp": 0.0, + "step": 2719, + "time_per_iteration": 2.932154417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.05857992, + "diversity_loss_mlp": 0.0, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.11614833297327579, + "language_loss": 0.89542395, + "learning_rate": 0.00048660528336228793, + "loss": 0.90612125, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.11151123, + "routerloss_mlp": 0.0, + "step": 2720, + "time_per_iteration": 2.7917380332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071611, + "balance_loss_mlp": 1.06013143, + "diversity_loss_mlp": 0.0, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05730438157509479, + "language_loss": 0.90177751, + "learning_rate": 0.0004862938554855606, + "loss": 0.91249359, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2721, + "time_per_iteration": 2.809875965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074661, + "balance_loss_mlp": 1.06371188, + "diversity_loss_mlp": 0.0, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.06740042101514945, + "language_loss": 0.86071771, + "learning_rate": 0.0004859824329300304, + "loss": 0.87146431, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2722, + "time_per_iteration": 2.5660176277160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070887, + "balance_loss_mlp": 1.05932951, + "diversity_loss_mlp": 0.0, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.06312939516717878, + "language_loss": 0.83826602, + "learning_rate": 0.00048567101581660244, + "loss": 0.84897488, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2723, + "time_per_iteration": 2.593005895614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107168, + "balance_loss_mlp": 1.0603317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.07171512526566694, + "language_loss": 0.86622667, + "learning_rate": 0.00048535960426617956, + "loss": 0.87694347, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2724, + "time_per_iteration": 2.611551523208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070313, + "balance_loss_mlp": 1.05852962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.07077799246948024, + "language_loss": 0.81735158, + "learning_rate": 0.0004850481983996621, + "loss": 0.82805473, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2725, + "time_per_iteration": 2.7656939029693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058325, + "balance_loss_mlp": 1.04673731, + "diversity_loss_mlp": 0.0, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.07497614956550303, + "language_loss": 0.87961793, + "learning_rate": 0.0004847367983379492, + "loss": 0.89020109, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2726, + "time_per_iteration": 2.523099899291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.05477571, + "diversity_loss_mlp": 0.0, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06275633211650163, + "language_loss": 0.78715622, + "learning_rate": 0.00048442540420193643, + "loss": 0.79781681, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2727, + "time_per_iteration": 2.9433038234710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056804, + "balance_loss_mlp": 1.04506755, + "diversity_loss_mlp": 0.0, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07393634521455344, + "language_loss": 0.79367208, + "learning_rate": 0.0004841140161125182, + "loss": 0.80424011, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2728, + "time_per_iteration": 3.619252920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.05171847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.07165329358033216, + "language_loss": 0.84827459, + "learning_rate": 0.0004838026341905857, + "loss": 0.85890496, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.11322021, + "routerloss_mlp": 0.0, + "step": 2729, + "time_per_iteration": 2.716114044189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057536, + "balance_loss_mlp": 1.04594862, + "diversity_loss_mlp": 0.0, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.13042739485624238, + "language_loss": 0.85312545, + "learning_rate": 0.00048349125855702844, + "loss": 0.86370087, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2730, + "time_per_iteration": 2.787280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837258, + "balance_loss_mlp": 1.43598437, + "diversity_loss_mlp": 0.21135046, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.027658523195400363, + "language_loss": 0.81318069, + "learning_rate": 0.00048317988933273287, + "loss": 0.82155323, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01359018, + "step": 2731, + "time_per_iteration": 2.763814687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057346, + "balance_loss_mlp": 1.04585993, + "diversity_loss_mlp": 0.0, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.07420390441928848, + "language_loss": 0.82373381, + "learning_rate": 0.00048286852663858367, + "loss": 0.83430725, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2732, + "time_per_iteration": 2.9533157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.05203819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.07616653501098058, + "language_loss": 0.8428973, + "learning_rate": 0.000482557170595462, + "loss": 0.8535338, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2733, + "time_per_iteration": 2.865147829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.0532366, + "diversity_loss_mlp": 0.0, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.060395165010054055, + "language_loss": 0.87880594, + "learning_rate": 0.0004822458213242475, + "loss": 0.88945693, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.11859131, + "routerloss_mlp": 0.0, + "step": 2734, + "time_per_iteration": 2.557253360748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070633, + "balance_loss_mlp": 1.05886698, + "diversity_loss_mlp": 0.0, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.1031910380133139, + "language_loss": 0.86086309, + "learning_rate": 0.00048193447894581627, + "loss": 0.8715694, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2735, + "time_per_iteration": 3.122976541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_mlp": 1.06436014, + "diversity_loss_mlp": 0.0, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06843040001694842, + "language_loss": 0.8809998, + "learning_rate": 0.00048162314358104243, + "loss": 0.89175981, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2736, + "time_per_iteration": 2.6340246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00824973, + "balance_loss_mlp": 1.41347969, + "diversity_loss_mlp": 0.20989257, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.031515925317837694, + "language_loss": 0.83306372, + "learning_rate": 0.0004813118153507969, + "loss": 0.84131336, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01328672, + "step": 2737, + "time_per_iteration": 2.7356157302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_mlp": 1.03480983, + "diversity_loss_mlp": 0.0, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03217065957479051, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83488321, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2738, + "time_per_iteration": 4.772867202758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.06062317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.0555866415390632, + "language_loss": 0.83715498, + "learning_rate": 0.00048068918077736163, + "loss": 0.84787494, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2739, + "time_per_iteration": 3.2028074264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076914, + "balance_loss_mlp": 1.06573176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06998122113459494, + "language_loss": 0.81445146, + "learning_rate": 0.0004803778746759001, + "loss": 0.82522058, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2740, + "time_per_iteration": 2.87070369720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082959, + "balance_loss_mlp": 1.07215285, + "diversity_loss_mlp": 0.0, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.07737040857299185, + "language_loss": 0.82122779, + "learning_rate": 0.00048006657619242317, + "loss": 0.83205736, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.10809326, + "routerloss_mlp": 0.0, + "step": 2741, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107519, + "balance_loss_mlp": 1.06447887, + "diversity_loss_mlp": 0.0, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07879516603511716, + "language_loss": 0.78380877, + "learning_rate": 0.00047975528544778775, + "loss": 0.79456067, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2742, + "time_per_iteration": 2.6197235584259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079206, + "balance_loss_mlp": 1.06839335, + "diversity_loss_mlp": 0.0, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07439948679259917, + "language_loss": 0.88816094, + "learning_rate": 0.00047944400256284754, + "loss": 0.89895302, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.10827637, + "routerloss_mlp": 0.0, + "step": 2743, + "time_per_iteration": 2.6887855529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00830459, + "balance_loss_mlp": 1.42072511, + "diversity_loss_mlp": 0.21262056, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.03227823662204125, + "language_loss": 0.799101, + "learning_rate": 0.0004791327276584532, + "loss": 0.80740565, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01378582, + "step": 2744, + "time_per_iteration": 2.8497848510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07629538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.0718535906247093, + "language_loss": 0.80497956, + "learning_rate": 0.00047882146085545264, + "loss": 0.81585032, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2745, + "time_per_iteration": 2.6078941822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017458, + "balance_loss_mlp": 1.01199865, + "diversity_loss_mlp": 0.0, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.013176381696238814, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76419842, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 2746, + "time_per_iteration": 4.974900007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078307, + "balance_loss_mlp": 1.06777453, + "diversity_loss_mlp": 0.0, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0894490118638191, + "language_loss": 0.79344547, + "learning_rate": 0.00047819895203700684, + "loss": 0.80422854, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2747, + "time_per_iteration": 2.717135190963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_mlp": 1.00983751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.009473538771460566, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76527709, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 2748, + "time_per_iteration": 4.642770290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085947, + "balance_loss_mlp": 1.07577801, + "diversity_loss_mlp": 0.0, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.07060951554594143, + "language_loss": 0.88469762, + "learning_rate": 0.0004775764770742277, + "loss": 0.89555711, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2749, + "time_per_iteration": 2.8018476963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087955, + "balance_loss_mlp": 1.07761312, + "diversity_loss_mlp": 0.0, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.08234082280170717, + "language_loss": 0.86406553, + "learning_rate": 0.00047726525259079777, + "loss": 0.8749451, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2750, + "time_per_iteration": 2.8415229320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00831428, + "balance_loss_mlp": 1.42309499, + "diversity_loss_mlp": 0.21321589, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.03400797212131273, + "language_loss": 0.88723552, + "learning_rate": 0.0004769540369337798, + "loss": 0.89554983, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01327293, + "step": 2751, + "time_per_iteration": 2.752032518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100532, + "balance_loss_mlp": 1.09000587, + "diversity_loss_mlp": 0.0, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06288245154731438, + "language_loss": 0.85769415, + "learning_rate": 0.00047664283022399794, + "loss": 0.86869949, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2752, + "time_per_iteration": 2.8568003177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107464, + "balance_loss_mlp": 1.09725976, + "diversity_loss_mlp": 0.0, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0883883166781065, + "language_loss": 0.80924225, + "learning_rate": 0.00047633163258227376, + "loss": 0.82031691, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2753, + "time_per_iteration": 2.8275938034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104119, + "balance_loss_mlp": 1.09359312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06733658380062774, + "language_loss": 0.85417688, + "learning_rate": 0.0004760204441294247, + "loss": 0.86521804, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2754, + "time_per_iteration": 2.6338090896606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104137, + "balance_loss_mlp": 1.09376574, + "diversity_loss_mlp": 0.0, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06936353635633287, + "language_loss": 0.85999346, + "learning_rate": 0.00047570926498626486, + "loss": 0.87103486, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 2755, + "time_per_iteration": 2.716575860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108637, + "balance_loss_mlp": 1.09822416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.061285448286525046, + "language_loss": 0.81361842, + "learning_rate": 0.00047539809527360474, + "loss": 0.82470477, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2756, + "time_per_iteration": 2.881225109100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.0919373, + "diversity_loss_mlp": 0.0, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05865021558391441, + "language_loss": 0.82642096, + "learning_rate": 0.0004750869351122511, + "loss": 0.83744538, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2757, + "time_per_iteration": 2.9978790283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096362, + "balance_loss_mlp": 1.08600891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.07787390265260127, + "language_loss": 0.81663013, + "learning_rate": 0.00047477578462300685, + "loss": 0.82759368, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2758, + "time_per_iteration": 2.700833797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090285, + "balance_loss_mlp": 1.07975245, + "diversity_loss_mlp": 0.0, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.069319292192906, + "language_loss": 0.80022508, + "learning_rate": 0.0004744646439266718, + "loss": 0.81112796, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2759, + "time_per_iteration": 3.0144033432006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084993, + "balance_loss_mlp": 1.07477677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.05678736813253772, + "language_loss": 0.92058611, + "learning_rate": 0.000474153513144041, + "loss": 0.93143606, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2760, + "time_per_iteration": 2.890305995941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082633, + "balance_loss_mlp": 1.07224369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06975892982263965, + "language_loss": 0.8659752, + "learning_rate": 0.00047384239239590633, + "loss": 0.87680155, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2761, + "time_per_iteration": 2.864649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076622, + "balance_loss_mlp": 1.06607819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06592907525694008, + "language_loss": 0.88956439, + "learning_rate": 0.0004735312818030556, + "loss": 0.90033066, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2762, + "time_per_iteration": 2.7256298065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.06967998, + "diversity_loss_mlp": 0.0, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06903030148880929, + "language_loss": 0.82737643, + "learning_rate": 0.0004732201814862727, + "loss": 0.83817625, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2763, + "time_per_iteration": 2.785104990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078579, + "balance_loss_mlp": 1.0687145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.07391416357546753, + "language_loss": 0.81619537, + "learning_rate": 0.0004729090915663373, + "loss": 0.82698119, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 2764, + "time_per_iteration": 2.841716766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841129, + "balance_loss_mlp": 1.43825924, + "diversity_loss_mlp": 0.21717778, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.03676047653681057, + "language_loss": 0.84753668, + "learning_rate": 0.00047259801216402534, + "loss": 0.85594797, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01341068, + "step": 2765, + "time_per_iteration": 2.5414865016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078812, + "balance_loss_mlp": 1.06872129, + "diversity_loss_mlp": 0.0, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.08353685320939014, + "language_loss": 0.86307138, + "learning_rate": 0.00047228694340010845, + "loss": 0.87385947, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 2766, + "time_per_iteration": 2.571230173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083419, + "balance_loss_mlp": 1.07304192, + "diversity_loss_mlp": 0.0, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07758433064211989, + "language_loss": 0.85983396, + "learning_rate": 0.0004719758853953544, + "loss": 0.87066811, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2767, + "time_per_iteration": 3.5577545166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.07479465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.08923013324738549, + "language_loss": 0.83480549, + "learning_rate": 0.00047166483827052645, + "loss": 0.84565854, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2768, + "time_per_iteration": 2.3904964923858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014357, + "balance_loss_mlp": 1.0088253, + "diversity_loss_mlp": 0.0, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.015852342000118255, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78092843, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2769, + "time_per_iteration": 4.993681907653809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100393, + "balance_loss_mlp": 1.08974218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.07499519146645399, + "language_loss": 0.8344022, + "learning_rate": 0.000471042777143682, + "loss": 0.84540612, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2770, + "time_per_iteration": 3.2187654972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099844, + "balance_loss_mlp": 1.0895741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.07177386868704265, + "language_loss": 0.79602164, + "learning_rate": 0.0004707317633831707, + "loss": 0.80702007, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2771, + "time_per_iteration": 2.5579092502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08694136, + "diversity_loss_mlp": 0.0, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.08358365289860634, + "language_loss": 0.78326285, + "learning_rate": 0.00047042076098559673, + "loss": 0.79423904, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.10687256, + "routerloss_mlp": 0.0, + "step": 2772, + "time_per_iteration": 2.6240808963775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089428, + "balance_loss_mlp": 1.07924104, + "diversity_loss_mlp": 0.0, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07827879900232339, + "language_loss": 0.7374208, + "learning_rate": 0.00047010977007170174, + "loss": 0.7483151, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2773, + "time_per_iteration": 3.239807605743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108646, + "balance_loss_mlp": 1.07606506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.0770996892807777, + "language_loss": 0.82462615, + "learning_rate": 0.00046979879076222334, + "loss": 0.83549076, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2774, + "time_per_iteration": 2.6871917247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081473, + "balance_loss_mlp": 1.07122087, + "diversity_loss_mlp": 0.0, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.060681013844514214, + "language_loss": 0.84932172, + "learning_rate": 0.0004694878231778939, + "loss": 0.86013645, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2775, + "time_per_iteration": 3.3516969680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_mlp": 1.07336903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06561156947814625, + "language_loss": 0.84353071, + "learning_rate": 0.0004691768674394423, + "loss": 0.85436922, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2776, + "time_per_iteration": 2.9356815814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010203, + "balance_loss_mlp": 1.01491189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.017317997453326725, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85504305, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2777, + "time_per_iteration": 4.766932010650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017275, + "balance_loss_mlp": 1.01186275, + "diversity_loss_mlp": 0.0, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.016201867017030143, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77670807, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 2778, + "time_per_iteration": 5.022111177444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081648, + "balance_loss_mlp": 1.07109189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.08348606714079294, + "language_loss": 0.79229748, + "learning_rate": 0.00046824407250656676, + "loss": 0.803114, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2779, + "time_per_iteration": 2.6202685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079296, + "balance_loss_mlp": 1.06859064, + "diversity_loss_mlp": 0.0, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.0812040646365834, + "language_loss": 0.83481312, + "learning_rate": 0.0004679331653588161, + "loss": 0.84560603, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2780, + "time_per_iteration": 2.6287879943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.07337165, + "diversity_loss_mlp": 0.0, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.08148878126655458, + "language_loss": 0.85570091, + "learning_rate": 0.0004676222706605147, + "loss": 0.86654037, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2781, + "time_per_iteration": 2.634186029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082358, + "balance_loss_mlp": 1.07175457, + "diversity_loss_mlp": 0.0, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08561637601090062, + "language_loss": 0.84885913, + "learning_rate": 0.0004673113885323626, + "loss": 0.85968268, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.10601807, + "routerloss_mlp": 0.0, + "step": 2782, + "time_per_iteration": 2.839108943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.07358241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.0730092425976976, + "language_loss": 0.78793383, + "learning_rate": 0.00046700051909505494, + "loss": 0.79877448, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.10479736, + "routerloss_mlp": 0.0, + "step": 2783, + "time_per_iteration": 3.1548988819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080824, + "balance_loss_mlp": 1.06943369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678731146909953, + "language_loss": 0.84066731, + "learning_rate": 0.000466689662469282, + "loss": 0.85147554, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2784, + "time_per_iteration": 2.6213507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082867, + "balance_loss_mlp": 1.07235312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06931446022689573, + "language_loss": 0.83996934, + "learning_rate": 0.00046637881877572917, + "loss": 0.85079801, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.10522461, + "routerloss_mlp": 0.0, + "step": 2785, + "time_per_iteration": 3.1161208152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_mlp": 1.07350779, + "diversity_loss_mlp": 0.0, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.05978198327100757, + "language_loss": 0.84824258, + "learning_rate": 0.0004660679881350764, + "loss": 0.85908508, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 2786, + "time_per_iteration": 2.7317774295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.0375849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.025126940202686972, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.7665174, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.06005859, + "routerloss_mlp": 0.0, + "step": 2787, + "time_per_iteration": 5.0151801109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079952, + "balance_loss_mlp": 1.06945598, + "diversity_loss_mlp": 0.0, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07181749108152896, + "language_loss": 0.78038859, + "learning_rate": 0.0004654463664951667, + "loss": 0.79118812, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2788, + "time_per_iteration": 2.9862492084503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.06444538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06160548649513732, + "language_loss": 0.83008492, + "learning_rate": 0.0004651355757372447, + "loss": 0.84083349, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2789, + "time_per_iteration": 2.6209347248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00838367, + "balance_loss_mlp": 1.43426061, + "diversity_loss_mlp": 0.2158158, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.029696530744324656, + "language_loss": 0.8589375, + "learning_rate": 0.00046482479851489274, + "loss": 0.86732113, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01332852, + "step": 2790, + "time_per_iteration": 2.6991934776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.06660962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09378702232215988, + "language_loss": 0.77937293, + "learning_rate": 0.00046451403494876525, + "loss": 0.79014528, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.10632324, + "routerloss_mlp": 0.0, + "step": 2791, + "time_per_iteration": 2.8735973834991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05943799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.07434319158841775, + "language_loss": 0.84554839, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625106, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.1083374, + "routerloss_mlp": 0.0, + "step": 2792, + "time_per_iteration": 2.7458536624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065699, + "balance_loss_mlp": 1.05472004, + "diversity_loss_mlp": 0.0, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.06545464420604186, + "language_loss": 0.85163087, + "learning_rate": 0.00046389254926777404, + "loss": 0.86228788, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.10980225, + "routerloss_mlp": 0.0, + "step": 2793, + "time_per_iteration": 2.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062925, + "balance_loss_mlp": 1.0519762, + "diversity_loss_mlp": 0.0, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.06502650627416932, + "language_loss": 0.78292251, + "learning_rate": 0.0004635818273941926, + "loss": 0.79355174, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2794, + "time_per_iteration": 3.569359302520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.04798412, + "diversity_loss_mlp": 0.0, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.0851115940139546, + "language_loss": 0.81696212, + "learning_rate": 0.0004632711196593997, + "loss": 0.82755053, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2795, + "time_per_iteration": 2.763248920440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059516, + "balance_loss_mlp": 1.04872167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08577601840657965, + "language_loss": 0.85307401, + "learning_rate": 0.00046296042618402297, + "loss": 0.86366916, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2796, + "time_per_iteration": 3.059995651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.05436158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.05816929772054262, + "language_loss": 0.79285312, + "learning_rate": 0.0004626497470886839, + "loss": 0.80350512, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2797, + "time_per_iteration": 2.9551138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.04897988, + "diversity_loss_mlp": 0.0, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06686475877008137, + "language_loss": 0.82082057, + "learning_rate": 0.00046233908249399897, + "loss": 0.83141726, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 2798, + "time_per_iteration": 2.7494163513183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_mlp": 1.06012726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.06311972638358435, + "language_loss": 0.78919041, + "learning_rate": 0.00046202843252057905, + "loss": 0.79990107, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.10943604, + "routerloss_mlp": 0.0, + "step": 2799, + "time_per_iteration": 2.586824655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.06545627, + "diversity_loss_mlp": 0.0, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06763496495115903, + "language_loss": 0.83705521, + "learning_rate": 0.00046171779728902896, + "loss": 0.84781897, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2800, + "time_per_iteration": 2.5922951698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.07354665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.12725923305511472, + "language_loss": 0.86135888, + "learning_rate": 0.000461407176919948, + "loss": 0.87220615, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2801, + "time_per_iteration": 2.532080888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.07459974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08372818850883645, + "language_loss": 0.85317719, + "learning_rate": 0.00046109657153392997, + "loss": 0.8640309, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2802, + "time_per_iteration": 2.7498726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.07185912, + "diversity_loss_mlp": 0.0, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07972844989907181, + "language_loss": 0.82981819, + "learning_rate": 0.0004607859812515622, + "loss": 0.84064734, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2803, + "time_per_iteration": 2.5823397636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077838, + "balance_loss_mlp": 1.06679916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06982591680837838, + "language_loss": 0.88185596, + "learning_rate": 0.00046047540619342667, + "loss": 0.89263427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2804, + "time_per_iteration": 2.582594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089122, + "balance_loss_mlp": 1.07845902, + "diversity_loss_mlp": 0.0, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.06923180186476277, + "language_loss": 0.80359995, + "learning_rate": 0.00046016484648009933, + "loss": 0.81449121, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.10675049, + "routerloss_mlp": 0.0, + "step": 2805, + "time_per_iteration": 2.705085277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082055, + "balance_loss_mlp": 1.0713259, + "diversity_loss_mlp": 0.0, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.06938884531628577, + "language_loss": 0.81049907, + "learning_rate": 0.0004598543022321501, + "loss": 0.82131958, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.10736084, + "routerloss_mlp": 0.0, + "step": 2806, + "time_per_iteration": 2.6722495555877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855076, + "balance_loss_mlp": 1.46593428, + "diversity_loss_mlp": 0.21781196, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.030466031644405155, + "language_loss": 0.79783833, + "learning_rate": 0.0004595437735701433, + "loss": 0.80638903, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01320273, + "step": 2807, + "time_per_iteration": 2.734110116958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.07728648, + "diversity_loss_mlp": 0.0, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08474622827734493, + "language_loss": 0.83849192, + "learning_rate": 0.00045923326061463623, + "loss": 0.84937334, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2808, + "time_per_iteration": 2.7606189250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089545, + "balance_loss_mlp": 1.07878006, + "diversity_loss_mlp": 0.0, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06442619071995537, + "language_loss": 0.8173002, + "learning_rate": 0.00045892276348618113, + "loss": 0.82819563, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2809, + "time_per_iteration": 2.9691591262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_mlp": 1.02887774, + "diversity_loss_mlp": 0.0, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.01908051648382603, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294789, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2810, + "time_per_iteration": 4.957923173904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089256, + "balance_loss_mlp": 1.07848597, + "diversity_loss_mlp": 0.0, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.05960464217413758, + "language_loss": 0.80596066, + "learning_rate": 0.000458301817192603, + "loss": 0.81685317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2811, + "time_per_iteration": 2.852247714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021724, + "balance_loss_mlp": 1.0165503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.015447521326512613, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81863511, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.05175781, + "routerloss_mlp": 0.0, + "step": 2812, + "time_per_iteration": 4.808724880218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080487, + "balance_loss_mlp": 1.06993747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.06805695837678187, + "language_loss": 0.87130654, + "learning_rate": 0.00045768093565369983, + "loss": 0.88211143, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 2813, + "time_per_iteration": 2.7794101238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.08034182, + "diversity_loss_mlp": 0.0, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.06578755075233327, + "language_loss": 0.8208549, + "learning_rate": 0.0004573705194685646, + "loss": 0.83176434, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2814, + "time_per_iteration": 2.686871290206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084437, + "balance_loss_mlp": 1.07364845, + "diversity_loss_mlp": 0.0, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07321549809116977, + "language_loss": 0.84966654, + "learning_rate": 0.00045706011983366157, + "loss": 0.86051095, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2815, + "time_per_iteration": 2.676772117614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843207, + "balance_loss_mlp": 1.44560027, + "diversity_loss_mlp": 0.21445701, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.03775972378408833, + "language_loss": 0.82685602, + "learning_rate": 0.00045674973686949847, + "loss": 0.83528805, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01317827, + "step": 2816, + "time_per_iteration": 2.548164129257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079521, + "balance_loss_mlp": 1.06887531, + "diversity_loss_mlp": 0.0, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06715248152064907, + "language_loss": 0.85478067, + "learning_rate": 0.0004564393706965766, + "loss": 0.86557591, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2817, + "time_per_iteration": 2.9715416431427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078759, + "balance_loss_mlp": 1.06789875, + "diversity_loss_mlp": 0.0, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.07300594242261846, + "language_loss": 0.81410033, + "learning_rate": 0.00045612902143539116, + "loss": 0.82488787, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2818, + "time_per_iteration": 2.5861568450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.05926371, + "diversity_loss_mlp": 0.0, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.07796543703625758, + "language_loss": 0.8169418, + "learning_rate": 0.00045581868920642986, + "loss": 0.82763875, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2819, + "time_per_iteration": 2.495675563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079235, + "balance_loss_mlp": 1.06864905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.08284985931126, + "language_loss": 0.79605496, + "learning_rate": 0.00045550837413017457, + "loss": 0.80684733, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2820, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_mlp": 1.07137275, + "diversity_loss_mlp": 0.0, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06853869944040722, + "language_loss": 0.85501075, + "learning_rate": 0.0004551980763271005, + "loss": 0.86582589, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2821, + "time_per_iteration": 2.6689629554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080137, + "balance_loss_mlp": 1.06970072, + "diversity_loss_mlp": 0.0, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.07047505467714002, + "language_loss": 0.83788973, + "learning_rate": 0.0004548877959176756, + "loss": 0.84869111, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2822, + "time_per_iteration": 2.8898305892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.06903815, + "diversity_loss_mlp": 0.0, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06782192405371351, + "language_loss": 0.86297488, + "learning_rate": 0.00045457753302236166, + "loss": 0.87376869, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2823, + "time_per_iteration": 2.626262903213501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.07755554, + "diversity_loss_mlp": 0.0, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07336203540826484, + "language_loss": 0.87131381, + "learning_rate": 0.00045426728776161353, + "loss": 0.88219345, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2824, + "time_per_iteration": 2.7630255222320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085716, + "balance_loss_mlp": 1.07529116, + "diversity_loss_mlp": 0.0, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.07766893457840997, + "language_loss": 0.81382459, + "learning_rate": 0.00045395706025587863, + "loss": 0.82468176, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2825, + "time_per_iteration": 2.653036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.05976105, + "diversity_loss_mlp": 0.0, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.08392292239142347, + "language_loss": 0.82965428, + "learning_rate": 0.00045364685062559843, + "loss": 0.84035897, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2826, + "time_per_iteration": 2.8091156482696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075397, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06510139608888613, + "language_loss": 0.91622829, + "learning_rate": 0.0004533366589912067, + "loss": 0.92698228, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2827, + "time_per_iteration": 2.949005365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075847, + "balance_loss_mlp": 1.06538677, + "diversity_loss_mlp": 0.0, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.07049343673366977, + "language_loss": 0.77641904, + "learning_rate": 0.0004530264854731306, + "loss": 0.78717756, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2828, + "time_per_iteration": 3.054252862930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079568, + "balance_loss_mlp": 1.06920242, + "diversity_loss_mlp": 0.0, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05986165572949975, + "language_loss": 0.84122354, + "learning_rate": 0.00045271633019179034, + "loss": 0.85201919, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2829, + "time_per_iteration": 2.788818836212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.06762242, + "diversity_loss_mlp": 0.0, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05963281032217842, + "language_loss": 0.87701666, + "learning_rate": 0.0004524061932675986, + "loss": 0.88779569, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2830, + "time_per_iteration": 2.861154079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073509, + "balance_loss_mlp": 1.06306028, + "diversity_loss_mlp": 0.0, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.11132414831600651, + "language_loss": 0.87095535, + "learning_rate": 0.00045209607482096125, + "loss": 0.88169038, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2831, + "time_per_iteration": 3.041248321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107632, + "balance_loss_mlp": 1.06573415, + "diversity_loss_mlp": 0.0, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.07049073021000962, + "language_loss": 0.84385192, + "learning_rate": 0.0004517859749722772, + "loss": 0.85461509, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2832, + "time_per_iteration": 2.663478374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075242, + "balance_loss_mlp": 1.0643816, + "diversity_loss_mlp": 0.0, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.06386820666055518, + "language_loss": 0.79316235, + "learning_rate": 0.0004514758938419376, + "loss": 0.80391467, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.10870361, + "routerloss_mlp": 0.0, + "step": 2833, + "time_per_iteration": 2.8141582012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_mlp": 1.03721869, + "diversity_loss_mlp": 0.0, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.027736452139364785, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77963334, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2834, + "time_per_iteration": 4.960749864578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075071, + "balance_loss_mlp": 1.06446719, + "diversity_loss_mlp": 0.0, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.06436328535255592, + "language_loss": 0.83993077, + "learning_rate": 0.00045085578821782175, + "loss": 0.85068148, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2835, + "time_per_iteration": 2.6025185585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020548, + "balance_loss_mlp": 1.01516008, + "diversity_loss_mlp": 0.0, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.015651807900939278, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155292, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2836, + "time_per_iteration": 4.911514043807983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079494, + "balance_loss_mlp": 1.06864595, + "diversity_loss_mlp": 0.0, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.05502946705999508, + "language_loss": 0.81078947, + "learning_rate": 0.00045023575891159866, + "loss": 0.82158434, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2837, + "time_per_iteration": 2.7158284187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_mlp": 1.00321293, + "diversity_loss_mlp": 0.0, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.010060791837063862, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75772309, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2838, + "time_per_iteration": 4.9448912143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078413, + "balance_loss_mlp": 1.06803036, + "diversity_loss_mlp": 0.0, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.059936217606746015, + "language_loss": 0.78111225, + "learning_rate": 0.0004496158068861354, + "loss": 0.79189646, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2839, + "time_per_iteration": 2.8019115924835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.07090366, + "diversity_loss_mlp": 0.0, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06804602152838367, + "language_loss": 0.80713242, + "learning_rate": 0.00044930586015455207, + "loss": 0.81794775, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.10638428, + "routerloss_mlp": 0.0, + "step": 2840, + "time_per_iteration": 2.771359443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076777, + "balance_loss_mlp": 1.06646562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.0578733121218936, + "language_loss": 0.88904727, + "learning_rate": 0.000448995933104179, + "loss": 0.89981508, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2841, + "time_per_iteration": 2.8486392498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.07075977, + "diversity_loss_mlp": 0.0, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.07392730491467848, + "language_loss": 0.80162299, + "learning_rate": 0.00044868602585534077, + "loss": 0.81243765, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2842, + "time_per_iteration": 2.8463480472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074348, + "balance_loss_mlp": 1.06379187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.0858024928700591, + "language_loss": 0.89360344, + "learning_rate": 0.0004483761385283541, + "loss": 0.90434694, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2843, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870358, + "balance_loss_mlp": 1.4994092, + "diversity_loss_mlp": 0.21570696, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.030684440159293704, + "language_loss": 0.8165319, + "learning_rate": 0.0004480662712435281, + "loss": 0.82523549, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01280049, + "step": 2844, + "time_per_iteration": 2.7523300647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081741, + "balance_loss_mlp": 1.07085109, + "diversity_loss_mlp": 0.0, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.08261462073704483, + "language_loss": 0.88389564, + "learning_rate": 0.0004477564241211635, + "loss": 0.89471304, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2845, + "time_per_iteration": 2.5676896572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068187, + "balance_loss_mlp": 1.0573566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07762403474355188, + "language_loss": 0.868963, + "learning_rate": 0.0004474465972815541, + "loss": 0.87964487, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2846, + "time_per_iteration": 2.4843738079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073996, + "balance_loss_mlp": 1.06337464, + "diversity_loss_mlp": 0.0, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05857404260801407, + "language_loss": 0.87612844, + "learning_rate": 0.000447136790844985, + "loss": 0.88686836, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.10626221, + "routerloss_mlp": 0.0, + "step": 2847, + "time_per_iteration": 2.659214973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068933, + "balance_loss_mlp": 1.05774474, + "diversity_loss_mlp": 0.0, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.0657788254057266, + "language_loss": 0.80922693, + "learning_rate": 0.00044682700493173385, + "loss": 0.81991625, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2848, + "time_per_iteration": 2.8093039989471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.06077814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06921376228249611, + "language_loss": 0.80399549, + "learning_rate": 0.00044651723966207004, + "loss": 0.81471407, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2849, + "time_per_iteration": 3.1084961891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.05826974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.06382752106805908, + "language_loss": 0.78137773, + "learning_rate": 0.00044620749515625536, + "loss": 0.79206896, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2850, + "time_per_iteration": 2.8127682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.05505395, + "diversity_loss_mlp": 0.0, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.07084116902380141, + "language_loss": 0.85142213, + "learning_rate": 0.00044589777153454334, + "loss": 0.86208153, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2851, + "time_per_iteration": 2.7690277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_mlp": 1.05239749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.06308922523972363, + "language_loss": 0.83850712, + "learning_rate": 0.00044558806891717895, + "loss": 0.84914547, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2852, + "time_per_iteration": 2.542076587677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066106, + "balance_loss_mlp": 1.05529404, + "diversity_loss_mlp": 0.0, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06000502851088379, + "language_loss": 0.79783493, + "learning_rate": 0.0004452783874243998, + "loss": 0.808496, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.1081543, + "routerloss_mlp": 0.0, + "step": 2853, + "time_per_iteration": 2.8680150508880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070657, + "balance_loss_mlp": 1.06022012, + "diversity_loss_mlp": 0.0, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.07387916596955035, + "language_loss": 0.84572864, + "learning_rate": 0.00044496872717643475, + "loss": 0.85643518, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2854, + "time_per_iteration": 2.676128625869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_mlp": 1.04261672, + "diversity_loss_mlp": 0.0, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.03710413532206065, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78137678, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2855, + "time_per_iteration": 4.937518835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.06609333, + "diversity_loss_mlp": 0.0, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.06582649113696544, + "language_loss": 0.81989098, + "learning_rate": 0.0004443494708958217, + "loss": 0.83065504, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2856, + "time_per_iteration": 2.9764318466186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.06707263, + "diversity_loss_mlp": 0.0, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.05962775351044122, + "language_loss": 0.80705082, + "learning_rate": 0.0004440398751035906, + "loss": 0.81782728, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2857, + "time_per_iteration": 2.8708760738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107925, + "balance_loss_mlp": 1.06846118, + "diversity_loss_mlp": 0.0, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.08652259855452149, + "language_loss": 0.83723986, + "learning_rate": 0.00044373030103700645, + "loss": 0.84803236, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2858, + "time_per_iteration": 2.629887342453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857386, + "balance_loss_mlp": 1.47058845, + "diversity_loss_mlp": 0.21831456, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.03034959963101528, + "language_loss": 0.79655832, + "learning_rate": 0.000443420748816257, + "loss": 0.80513215, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293462, + "step": 2859, + "time_per_iteration": 2.8473408222198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107821, + "balance_loss_mlp": 1.06795764, + "diversity_loss_mlp": 0.0, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.07076083110298415, + "language_loss": 0.78692329, + "learning_rate": 0.0004431112185615208, + "loss": 0.79770535, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2860, + "time_per_iteration": 2.751131534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.0721283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.06396450124437818, + "language_loss": 0.7993266, + "learning_rate": 0.00044280171039296845, + "loss": 0.81015229, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2861, + "time_per_iteration": 2.606870651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082002, + "balance_loss_mlp": 1.0716126, + "diversity_loss_mlp": 0.0, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.0734058146638898, + "language_loss": 0.8832019, + "learning_rate": 0.0004424922244307616, + "loss": 0.89402187, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2862, + "time_per_iteration": 2.728055477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081346, + "balance_loss_mlp": 1.07124305, + "diversity_loss_mlp": 0.0, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.08810368166009505, + "language_loss": 0.82030249, + "learning_rate": 0.00044218276079505315, + "loss": 0.83111596, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2863, + "time_per_iteration": 2.8925743103027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076982, + "balance_loss_mlp": 1.0667721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.06918705117949257, + "language_loss": 0.74817479, + "learning_rate": 0.0004418733196059876, + "loss": 0.75894463, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2864, + "time_per_iteration": 2.747131109237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068041, + "balance_loss_mlp": 1.0579797, + "diversity_loss_mlp": 0.0, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.060188467246496694, + "language_loss": 0.79747194, + "learning_rate": 0.0004415639009837008, + "loss": 0.80815232, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 2865, + "time_per_iteration": 2.838609218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077074, + "balance_loss_mlp": 1.06704867, + "diversity_loss_mlp": 0.0, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.06869441498871262, + "language_loss": 0.82126647, + "learning_rate": 0.00044125450504831955, + "loss": 0.83203721, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2866, + "time_per_iteration": 2.7267115116119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.07046294, + "diversity_loss_mlp": 0.0, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.0812577822304444, + "language_loss": 0.82503623, + "learning_rate": 0.0004409451319199622, + "loss": 0.83584309, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2867, + "time_per_iteration": 2.6727194786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.07005203, + "diversity_loss_mlp": 0.0, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07302760882162292, + "language_loss": 0.84415638, + "learning_rate": 0.0004406357817187381, + "loss": 0.8549571, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2868, + "time_per_iteration": 2.9669716358184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.07424247, + "diversity_loss_mlp": 0.0, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.06120403113840053, + "language_loss": 0.81250817, + "learning_rate": 0.0004403264545647474, + "loss": 0.82335043, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 2869, + "time_per_iteration": 3.535280704498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092656, + "balance_loss_mlp": 1.08244562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.05305368525165607, + "language_loss": 0.84751379, + "learning_rate": 0.00044001715057808154, + "loss": 0.85844034, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2870, + "time_per_iteration": 2.757197618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867753, + "balance_loss_mlp": 1.49414647, + "diversity_loss_mlp": 0.21602358, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.02933333976418528, + "language_loss": 0.81627762, + "learning_rate": 0.0004397078698788232, + "loss": 0.82495517, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01266836, + "step": 2871, + "time_per_iteration": 3.241936445236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_mlp": 1.04097104, + "diversity_loss_mlp": 0.0, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.0256992480173019, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81488657, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2872, + "time_per_iteration": 4.879035234451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_mlp": 1.09304726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.06889966135830194, + "language_loss": 0.78025937, + "learning_rate": 0.00043908937882281343, + "loss": 0.79129106, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.10119629, + "routerloss_mlp": 0.0, + "step": 2873, + "time_per_iteration": 2.624072313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097291, + "balance_loss_mlp": 1.08644319, + "diversity_loss_mlp": 0.0, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.06659644406743612, + "language_loss": 0.82492054, + "learning_rate": 0.0004387801687061814, + "loss": 0.83589351, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2874, + "time_per_iteration": 2.839524269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_mlp": 1.09040689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.06411004123803754, + "language_loss": 0.80204833, + "learning_rate": 0.0004384709823571958, + "loss": 0.81305587, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2875, + "time_per_iteration": 2.768268346786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092947, + "balance_loss_mlp": 1.08278441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0827933156096061, + "language_loss": 0.83099473, + "learning_rate": 0.0004381618198958932, + "loss": 0.84192419, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 2876, + "time_per_iteration": 3.509364604949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084381, + "balance_loss_mlp": 1.07393849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0672046455921574, + "language_loss": 0.83616996, + "learning_rate": 0.00043785268144230137, + "loss": 0.84701377, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2877, + "time_per_iteration": 2.8941080570220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078858, + "balance_loss_mlp": 1.06849325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.08466064144544548, + "language_loss": 0.82657743, + "learning_rate": 0.00043754356711643837, + "loss": 0.83736604, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2878, + "time_per_iteration": 2.6849513053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.0620904, + "diversity_loss_mlp": 0.0, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.08115939494621484, + "language_loss": 0.84283209, + "learning_rate": 0.0004372344770383132, + "loss": 0.85355723, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2879, + "time_per_iteration": 2.809833526611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.05426884, + "diversity_loss_mlp": 0.0, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.15468249092113104, + "language_loss": 0.82951438, + "learning_rate": 0.00043692541132792507, + "loss": 0.84015906, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2880, + "time_per_iteration": 2.6886332035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.05541205, + "diversity_loss_mlp": 0.0, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07258014540865806, + "language_loss": 0.83396262, + "learning_rate": 0.00043661637010526384, + "loss": 0.84461993, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2881, + "time_per_iteration": 2.484912872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010637, + "balance_loss_mlp": 1.05335283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.07022154553173111, + "language_loss": 0.83217472, + "learning_rate": 0.00043630735349031025, + "loss": 0.8428117, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2882, + "time_per_iteration": 2.627950429916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.05427396, + "diversity_loss_mlp": 0.0, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.05734398116556458, + "language_loss": 0.81837022, + "learning_rate": 0.00043599836160303495, + "loss": 0.8290168, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2883, + "time_per_iteration": 2.87358021736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_mlp": 1.05094647, + "diversity_loss_mlp": 0.0, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.05952583825506871, + "language_loss": 0.77472365, + "learning_rate": 0.0004356893945633995, + "loss": 0.78534073, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2884, + "time_per_iteration": 2.9415786266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058414, + "balance_loss_mlp": 1.04738104, + "diversity_loss_mlp": 0.0, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06387157363580499, + "language_loss": 0.81997669, + "learning_rate": 0.0004353804524913551, + "loss": 0.8305608, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2885, + "time_per_iteration": 2.5772132873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106399, + "balance_loss_mlp": 1.05298674, + "diversity_loss_mlp": 0.0, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07314612024272811, + "language_loss": 0.82015049, + "learning_rate": 0.0004350715355068441, + "loss": 0.8307904, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.11010742, + "routerloss_mlp": 0.0, + "step": 2886, + "time_per_iteration": 2.7211849689483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062969, + "balance_loss_mlp": 1.05221653, + "diversity_loss_mlp": 0.0, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.08671001380075964, + "language_loss": 0.79774809, + "learning_rate": 0.00043476264372979847, + "loss": 0.8083778, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.10754395, + "routerloss_mlp": 0.0, + "step": 2887, + "time_per_iteration": 2.5452206134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.05403173, + "diversity_loss_mlp": 0.0, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.08125450311694367, + "language_loss": 0.78590369, + "learning_rate": 0.0004344537772801408, + "loss": 0.79654968, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.10577393, + "routerloss_mlp": 0.0, + "step": 2888, + "time_per_iteration": 3.870267391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_mlp": 1.02839172, + "diversity_loss_mlp": 0.0, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.026917818165577125, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74456155, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2889, + "time_per_iteration": 4.943026065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091351, + "balance_loss_mlp": 1.08043766, + "diversity_loss_mlp": 0.0, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07456412824125162, + "language_loss": 0.83536172, + "learning_rate": 0.0004338361208426298, + "loss": 0.84627521, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2890, + "time_per_iteration": 2.65266752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094404, + "balance_loss_mlp": 1.08348465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.057576040721241756, + "language_loss": 0.81499392, + "learning_rate": 0.00043352733109457164, + "loss": 0.82593793, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2891, + "time_per_iteration": 2.927246332168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106638, + "balance_loss_mlp": 1.09556401, + "diversity_loss_mlp": 0.0, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.0763949134442708, + "language_loss": 0.84462321, + "learning_rate": 0.00043321856715349244, + "loss": 0.85568959, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2892, + "time_per_iteration": 2.970857858657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_mlp": 1.0918721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07453927070697552, + "language_loss": 0.80594504, + "learning_rate": 0.00043290982913926466, + "loss": 0.81697285, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.10913086, + "routerloss_mlp": 0.0, + "step": 2893, + "time_per_iteration": 2.8581972122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105658, + "balance_loss_mlp": 1.09473801, + "diversity_loss_mlp": 0.0, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.08476057735977802, + "language_loss": 0.84177083, + "learning_rate": 0.0004326011171717514, + "loss": 0.85282743, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2894, + "time_per_iteration": 2.90563702583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094642, + "balance_loss_mlp": 1.08371019, + "diversity_loss_mlp": 0.0, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06785531665857511, + "language_loss": 0.80468631, + "learning_rate": 0.0004322924313708051, + "loss": 0.8156327, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2895, + "time_per_iteration": 2.51784610748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092855, + "balance_loss_mlp": 1.08219218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.07706946900287333, + "language_loss": 0.84533763, + "learning_rate": 0.0004319837718562681, + "loss": 0.85626626, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.10668945, + "routerloss_mlp": 0.0, + "step": 2896, + "time_per_iteration": 2.5862512588500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083747, + "balance_loss_mlp": 1.07321525, + "diversity_loss_mlp": 0.0, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.0793708179068888, + "language_loss": 0.83050567, + "learning_rate": 0.0004316751387479726, + "loss": 0.84134316, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2897, + "time_per_iteration": 2.778136730194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857516, + "balance_loss_mlp": 1.47219694, + "diversity_loss_mlp": 0.21748725, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.034004819690404205, + "language_loss": 0.82499564, + "learning_rate": 0.0004313665321657409, + "loss": 0.83357084, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267361, + "step": 2898, + "time_per_iteration": 3.7754030227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06795418, + "diversity_loss_mlp": 0.0, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.08236969633510602, + "language_loss": 0.79824448, + "learning_rate": 0.00043105795222938436, + "loss": 0.80903113, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2899, + "time_per_iteration": 2.7090694904327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073397, + "balance_loss_mlp": 1.06296027, + "diversity_loss_mlp": 0.0, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07659548301877016, + "language_loss": 0.78690445, + "learning_rate": 0.00043074939905870467, + "loss": 0.79763848, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2900, + "time_per_iteration": 2.6444900035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.08372730008806528, + "language_loss": 0.80284113, + "learning_rate": 0.0004304408727734927, + "loss": 0.81353253, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 2901, + "time_per_iteration": 2.6800661087036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855039, + "balance_loss_mlp": 1.46478724, + "diversity_loss_mlp": 0.21833366, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.026106559121528438, + "language_loss": 0.88945115, + "learning_rate": 0.0004301323734935288, + "loss": 0.89800155, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01347797, + "step": 2902, + "time_per_iteration": 2.6880388259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05446076, + "diversity_loss_mlp": 0.0, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.08715674624995783, + "language_loss": 0.87386537, + "learning_rate": 0.000429823901338583, + "loss": 0.88451326, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2903, + "time_per_iteration": 2.611330032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.06004524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.07350666628476007, + "language_loss": 0.86772639, + "learning_rate": 0.00042951545642841513, + "loss": 0.87843215, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2904, + "time_per_iteration": 3.066653251647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06802535, + "diversity_loss_mlp": 0.0, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06907930895976065, + "language_loss": 0.86694556, + "learning_rate": 0.0004292070388827737, + "loss": 0.87773216, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.10644531, + "routerloss_mlp": 0.0, + "step": 2905, + "time_per_iteration": 2.5430614948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068326, + "balance_loss_mlp": 1.05785918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06877653703862108, + "language_loss": 0.81346464, + "learning_rate": 0.00042889864882139753, + "loss": 0.82414794, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2906, + "time_per_iteration": 2.5722434520721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075755, + "balance_loss_mlp": 1.06534863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06732553967994827, + "language_loss": 0.81503737, + "learning_rate": 0.0004285902863640139, + "loss": 0.82579494, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 2907, + "time_per_iteration": 2.643721580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_mlp": 1.06431222, + "diversity_loss_mlp": 0.0, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.06943407338412115, + "language_loss": 0.86278725, + "learning_rate": 0.00042828195163033966, + "loss": 0.87353367, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2908, + "time_per_iteration": 2.7045791149139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081822, + "balance_loss_mlp": 1.07135582, + "diversity_loss_mlp": 0.0, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07324820072157985, + "language_loss": 0.79102659, + "learning_rate": 0.0004279736447400812, + "loss": 0.80184484, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2909, + "time_per_iteration": 2.585176944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107588, + "balance_loss_mlp": 1.06558049, + "diversity_loss_mlp": 0.0, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.07142642262643135, + "language_loss": 0.78468478, + "learning_rate": 0.00042766536581293385, + "loss": 0.79544365, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2910, + "time_per_iteration": 2.723602771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07975566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.0702995437532307, + "language_loss": 0.79552364, + "learning_rate": 0.0004273571149685819, + "loss": 0.80642736, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2911, + "time_per_iteration": 2.7220258712768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091791, + "balance_loss_mlp": 1.08147311, + "diversity_loss_mlp": 0.0, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.06270923487878967, + "language_loss": 0.84021366, + "learning_rate": 0.00042704889232669937, + "loss": 0.85113156, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2912, + "time_per_iteration": 2.6799380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848913, + "balance_loss_mlp": 1.45588994, + "diversity_loss_mlp": 0.21708892, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.03254511626684893, + "language_loss": 0.85648382, + "learning_rate": 0.0004267406980069484, + "loss": 0.86497295, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01242387, + "step": 2913, + "time_per_iteration": 2.7309391498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.10193157, + "diversity_loss_mlp": 0.0, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.05402445789476675, + "language_loss": 0.79744071, + "learning_rate": 0.0004264325321289808, + "loss": 0.80856508, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2914, + "time_per_iteration": 2.8245773315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_mlp": 1.09404707, + "diversity_loss_mlp": 0.0, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.07588418732744176, + "language_loss": 0.86308336, + "learning_rate": 0.00042612439481243736, + "loss": 0.87412667, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2915, + "time_per_iteration": 2.7910971641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_mlp": 1.09916496, + "diversity_loss_mlp": 0.0, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.07165476469353879, + "language_loss": 0.90284097, + "learning_rate": 0.00042581628617694735, + "loss": 0.91393661, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2916, + "time_per_iteration": 2.7449898719787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00839442, + "balance_loss_mlp": 1.43753612, + "diversity_loss_mlp": 0.21687999, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.03331291255724556, + "language_loss": 0.81856477, + "learning_rate": 0.0004255082063421296, + "loss": 0.82695925, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01223436, + "step": 2917, + "time_per_iteration": 2.705263614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.12130046, + "diversity_loss_mlp": 0.0, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.07697799391889214, + "language_loss": 0.84842837, + "learning_rate": 0.00042520015542759065, + "loss": 0.85974395, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2918, + "time_per_iteration": 2.8643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_mlp": 1.09857666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.059259650717302215, + "language_loss": 0.88182557, + "learning_rate": 0.00042489213355292687, + "loss": 0.89291489, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2919, + "time_per_iteration": 2.871605634689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113923, + "balance_loss_mlp": 1.1035037, + "diversity_loss_mlp": 0.0, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.07025137955977834, + "language_loss": 0.81129396, + "learning_rate": 0.00042458414083772276, + "loss": 0.82243323, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2920, + "time_per_iteration": 2.5280137062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110383, + "balance_loss_mlp": 1.09353638, + "diversity_loss_mlp": 0.0, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.06291310679725345, + "language_loss": 0.85259616, + "learning_rate": 0.000424276177401552, + "loss": 0.86363447, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 2921, + "time_per_iteration": 2.8061861991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091107, + "balance_loss_mlp": 1.08052063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06947728514830868, + "language_loss": 0.8586399, + "learning_rate": 0.0004239682433639763, + "loss": 0.86955094, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2922, + "time_per_iteration": 2.7068192958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.07726383, + "diversity_loss_mlp": 0.0, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.06724553342566655, + "language_loss": 0.85617495, + "learning_rate": 0.0004236603388445467, + "loss": 0.86705184, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 2923, + "time_per_iteration": 2.5658164024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_mlp": 1.07329023, + "diversity_loss_mlp": 0.0, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.06491959150956746, + "language_loss": 0.82087809, + "learning_rate": 0.00042335246396280166, + "loss": 0.83171237, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 2924, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076248, + "balance_loss_mlp": 1.06606197, + "diversity_loss_mlp": 0.0, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06924351044147684, + "language_loss": 0.90442908, + "learning_rate": 0.0004230446188382693, + "loss": 0.91519153, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2925, + "time_per_iteration": 2.5210559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072823, + "balance_loss_mlp": 1.06237423, + "diversity_loss_mlp": 0.0, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06189914516088338, + "language_loss": 0.80191588, + "learning_rate": 0.0004227368035904654, + "loss": 0.81264406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2926, + "time_per_iteration": 2.957545757293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073658, + "balance_loss_mlp": 1.06312013, + "diversity_loss_mlp": 0.0, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.07119677802103677, + "language_loss": 0.8312782, + "learning_rate": 0.00042242901833889474, + "loss": 0.84201479, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 2927, + "time_per_iteration": 2.6197497844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069584, + "balance_loss_mlp": 1.05933261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.07548469953325632, + "language_loss": 0.85944557, + "learning_rate": 0.0004221212632030501, + "loss": 0.87014145, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2928, + "time_per_iteration": 3.0718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074032, + "balance_loss_mlp": 1.0636375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.0702405954135719, + "language_loss": 0.8005904, + "learning_rate": 0.0004218135383024124, + "loss": 0.81133074, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2929, + "time_per_iteration": 2.6883885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068804, + "balance_loss_mlp": 1.05836129, + "diversity_loss_mlp": 0.0, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.07423933793606223, + "language_loss": 0.85405028, + "learning_rate": 0.0004215058437564511, + "loss": 0.86473835, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2930, + "time_per_iteration": 2.5645458698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.06520677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.07045402067927274, + "language_loss": 0.82365847, + "learning_rate": 0.00042119817968462397, + "loss": 0.83441579, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2931, + "time_per_iteration": 2.596431255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843243, + "balance_loss_mlp": 1.44432163, + "diversity_loss_mlp": 0.21611315, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.034099962370994746, + "language_loss": 0.87154222, + "learning_rate": 0.0004208905462063766, + "loss": 0.8799746, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01302544, + "step": 2932, + "time_per_iteration": 2.7103724479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088146, + "balance_loss_mlp": 1.07760167, + "diversity_loss_mlp": 0.0, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07257480225633914, + "language_loss": 0.84035242, + "learning_rate": 0.00042058294344114315, + "loss": 0.8512339, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2933, + "time_per_iteration": 2.6817541122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846618, + "balance_loss_mlp": 1.45035362, + "diversity_loss_mlp": 0.21710092, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.03239193802507573, + "language_loss": 0.77597153, + "learning_rate": 0.0004202753715083456, + "loss": 0.78443778, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289086, + "step": 2934, + "time_per_iteration": 3.1172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.08684492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.08960488369203884, + "language_loss": 0.8126961, + "learning_rate": 0.0004199678305273936, + "loss": 0.82367325, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2935, + "time_per_iteration": 2.648293972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096103, + "balance_loss_mlp": 1.08564794, + "diversity_loss_mlp": 0.0, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06584718006017456, + "language_loss": 0.81395173, + "learning_rate": 0.0004196603206176854, + "loss": 0.82491279, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2936, + "time_per_iteration": 2.9504921436309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_mlp": 1.09094691, + "diversity_loss_mlp": 0.0, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.06854637503151859, + "language_loss": 0.83705592, + "learning_rate": 0.000419352841898607, + "loss": 0.84806919, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2937, + "time_per_iteration": 2.965176582336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_mlp": 1.09003913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.06908295336200668, + "language_loss": 0.77684075, + "learning_rate": 0.000419045394489532, + "loss": 0.7878446, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2938, + "time_per_iteration": 2.692997455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094877, + "balance_loss_mlp": 1.08429718, + "diversity_loss_mlp": 0.0, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.06508171061148607, + "language_loss": 0.76831025, + "learning_rate": 0.0004187379785098224, + "loss": 0.77925897, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2939, + "time_per_iteration": 3.123154401779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110149, + "balance_loss_mlp": 1.09110653, + "diversity_loss_mlp": 0.0, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.08014464510269267, + "language_loss": 0.83749938, + "learning_rate": 0.00041843059407882744, + "loss": 0.84851432, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2940, + "time_per_iteration": 2.9720611572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099107, + "balance_loss_mlp": 1.0887475, + "diversity_loss_mlp": 0.0, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.06910210619422795, + "language_loss": 0.82642627, + "learning_rate": 0.0004181232413158842, + "loss": 0.83741736, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2941, + "time_per_iteration": 2.657360315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094217, + "balance_loss_mlp": 1.08388722, + "diversity_loss_mlp": 0.0, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08913898875539945, + "language_loss": 0.82192254, + "learning_rate": 0.0004178159203403179, + "loss": 0.83286464, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2942, + "time_per_iteration": 2.8812596797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.07014799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06202774017820852, + "language_loss": 0.8130517, + "learning_rate": 0.0004175086312714409, + "loss": 0.82385445, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 2943, + "time_per_iteration": 2.561537027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.07015431, + "diversity_loss_mlp": 0.0, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05809127095966742, + "language_loss": 0.83570457, + "learning_rate": 0.00041720137422855366, + "loss": 0.84651101, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 2944, + "time_per_iteration": 2.7395284175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.06576228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.07239714207057282, + "language_loss": 0.79116005, + "learning_rate": 0.00041689414933094383, + "loss": 0.80191475, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2945, + "time_per_iteration": 2.654930353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067367, + "balance_loss_mlp": 1.05734193, + "diversity_loss_mlp": 0.0, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.07615309090382201, + "language_loss": 0.80823922, + "learning_rate": 0.00041658695669788653, + "loss": 0.81891298, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2946, + "time_per_iteration": 2.747903347015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.05894506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.09594015960064259, + "language_loss": 0.81304628, + "learning_rate": 0.00041627979644864453, + "loss": 0.82373923, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2947, + "time_per_iteration": 2.8192365169525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064628, + "balance_loss_mlp": 1.05435264, + "diversity_loss_mlp": 0.0, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06124486727819338, + "language_loss": 0.81212783, + "learning_rate": 0.0004159726687024683, + "loss": 0.82277411, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 2948, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_mlp": 1.05610037, + "diversity_loss_mlp": 0.0, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.0698899799050157, + "language_loss": 0.7929486, + "learning_rate": 0.00041566557357859506, + "loss": 0.80361444, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2949, + "time_per_iteration": 2.861374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05816913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.0603589352170923, + "language_loss": 0.79605162, + "learning_rate": 0.0004153585111962502, + "loss": 0.80673802, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2950, + "time_per_iteration": 3.3136749267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.06528509, + "diversity_loss_mlp": 0.0, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.07046051490297799, + "language_loss": 0.84271163, + "learning_rate": 0.0004150514816746453, + "loss": 0.85347259, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 2951, + "time_per_iteration": 2.7142550945281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079575, + "balance_loss_mlp": 1.0689894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07561213643312675, + "language_loss": 0.85564739, + "learning_rate": 0.0004147444851329802, + "loss": 0.8664431, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2952, + "time_per_iteration": 2.663442611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079915, + "balance_loss_mlp": 1.06943655, + "diversity_loss_mlp": 0.0, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.06334656392280237, + "language_loss": 0.85917854, + "learning_rate": 0.00041443752169044126, + "loss": 0.86997765, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2953, + "time_per_iteration": 3.0424787998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083209, + "balance_loss_mlp": 1.07296944, + "diversity_loss_mlp": 0.0, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.08759511227816434, + "language_loss": 0.84844387, + "learning_rate": 0.0004141305914662025, + "loss": 0.85927594, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2954, + "time_per_iteration": 2.720574378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080604, + "balance_loss_mlp": 1.06977344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0625505952609041, + "language_loss": 0.80443704, + "learning_rate": 0.0004138236945794246, + "loss": 0.81524312, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2955, + "time_per_iteration": 2.880007743835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067912, + "balance_loss_mlp": 1.05775595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.08164782403227437, + "language_loss": 0.84066302, + "learning_rate": 0.00041351683114925576, + "loss": 0.85134214, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2956, + "time_per_iteration": 3.061213731765747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.06213737, + "diversity_loss_mlp": 0.0, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06079019071224684, + "language_loss": 0.86355555, + "learning_rate": 0.0004132100012948308, + "loss": 0.87427759, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 2957, + "time_per_iteration": 2.631786823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_mlp": 1.0587523, + "diversity_loss_mlp": 0.0, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.07979265854660174, + "language_loss": 0.84526646, + "learning_rate": 0.00041290320513527145, + "loss": 0.85595882, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2958, + "time_per_iteration": 2.5593366622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.05111814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.09201222931646683, + "language_loss": 0.85128796, + "learning_rate": 0.0004125964427896867, + "loss": 0.86190271, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.1036377, + "routerloss_mlp": 0.0, + "step": 2959, + "time_per_iteration": 2.667381525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.05320501, + "diversity_loss_mlp": 0.0, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06922825543149586, + "language_loss": 0.79212141, + "learning_rate": 0.0004122897143771723, + "loss": 0.80275661, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2960, + "time_per_iteration": 2.523068904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.0569005, + "diversity_loss_mlp": 0.0, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06880331468011665, + "language_loss": 0.81306094, + "learning_rate": 0.0004119830200168109, + "loss": 0.82373345, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2961, + "time_per_iteration": 2.7224626541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106382, + "balance_loss_mlp": 1.05356169, + "diversity_loss_mlp": 0.0, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08443053343043137, + "language_loss": 0.88515878, + "learning_rate": 0.0004116763598276714, + "loss": 0.89579695, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 2962, + "time_per_iteration": 2.4910728931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05738318, + "diversity_loss_mlp": 0.0, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.07427131552828858, + "language_loss": 0.81298989, + "learning_rate": 0.00041136973392881017, + "loss": 0.82366574, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 2963, + "time_per_iteration": 2.8261218070983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_mlp": 1.05275846, + "diversity_loss_mlp": 0.0, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.0795338566562928, + "language_loss": 0.82039535, + "learning_rate": 0.00041106314243926983, + "loss": 0.83102989, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2964, + "time_per_iteration": 2.7321033477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058191, + "balance_loss_mlp": 1.04802823, + "diversity_loss_mlp": 0.0, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.07985594809339186, + "language_loss": 0.87473917, + "learning_rate": 0.0004107565854780798, + "loss": 0.88532114, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2965, + "time_per_iteration": 2.685188055038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105982, + "balance_loss_mlp": 1.0495863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.12021988187086102, + "language_loss": 0.80887079, + "learning_rate": 0.000410450063164256, + "loss": 0.81946903, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2966, + "time_per_iteration": 2.8859732151031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_mlp": 1.05084372, + "diversity_loss_mlp": 0.0, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.07877125068742231, + "language_loss": 0.82298398, + "learning_rate": 0.00041014357561680115, + "loss": 0.83359516, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2967, + "time_per_iteration": 2.5546090602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072036, + "balance_loss_mlp": 1.06186163, + "diversity_loss_mlp": 0.0, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0603559044145355, + "language_loss": 0.86396813, + "learning_rate": 0.0004098371229547039, + "loss": 0.87468845, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 2968, + "time_per_iteration": 2.7246880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055906, + "balance_loss_mlp": 1.05082798, + "diversity_loss_mlp": 0.0, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.032213471653528905, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81066716, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2969, + "time_per_iteration": 4.802457571029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845784, + "balance_loss_mlp": 1.44834208, + "diversity_loss_mlp": 0.21849446, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.042172582609019446, + "language_loss": 0.80489594, + "learning_rate": 0.00040922432276247107, + "loss": 0.81335378, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01236574, + "step": 2970, + "time_per_iteration": 2.579711675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100592, + "balance_loss_mlp": 1.09026289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.08651791755700546, + "language_loss": 0.84556907, + "learning_rate": 0.0004089179754702457, + "loss": 0.85657501, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2971, + "time_per_iteration": 2.744509220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109172, + "balance_loss_mlp": 1.08128309, + "diversity_loss_mlp": 0.0, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.0875480726861112, + "language_loss": 0.79658413, + "learning_rate": 0.00040861166353919843, + "loss": 0.80750132, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2972, + "time_per_iteration": 2.816767692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843649, + "balance_loss_mlp": 1.44322622, + "diversity_loss_mlp": 0.21953782, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.0303598736791247, + "language_loss": 0.81879437, + "learning_rate": 0.00040830538708824983, + "loss": 0.82723081, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226737, + "step": 2973, + "time_per_iteration": 2.8936269283294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084736, + "balance_loss_mlp": 1.07479978, + "diversity_loss_mlp": 0.0, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.06866249599002382, + "language_loss": 0.81754982, + "learning_rate": 0.000407999146236307, + "loss": 0.82839715, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2974, + "time_per_iteration": 2.558587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086743, + "balance_loss_mlp": 1.07657444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.07286762161416734, + "language_loss": 0.83382261, + "learning_rate": 0.0004076929411022634, + "loss": 0.84468997, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2975, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.07231879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.06868291627032407, + "language_loss": 0.79575276, + "learning_rate": 0.0004073867718049982, + "loss": 0.80657583, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 2976, + "time_per_iteration": 3.082519054412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841274, + "balance_loss_mlp": 1.44052804, + "diversity_loss_mlp": 0.21771878, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.03510584247140754, + "language_loss": 0.8255651, + "learning_rate": 0.00040708063846337704, + "loss": 0.83397782, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01215104, + "step": 2977, + "time_per_iteration": 2.7563750743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108914, + "balance_loss_mlp": 1.07897186, + "diversity_loss_mlp": 0.0, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07105452232664011, + "language_loss": 0.81019402, + "learning_rate": 0.00040677454119625143, + "loss": 0.82108539, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2978, + "time_per_iteration": 2.575923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089611, + "balance_loss_mlp": 1.07962155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07243213986729599, + "language_loss": 0.82912952, + "learning_rate": 0.0004064684801224587, + "loss": 0.84002566, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 2979, + "time_per_iteration": 2.5965535640716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085844, + "balance_loss_mlp": 1.07600939, + "diversity_loss_mlp": 0.0, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.11138747568582645, + "language_loss": 0.80322999, + "learning_rate": 0.00040616245536082224, + "loss": 0.81408834, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2980, + "time_per_iteration": 2.599320650100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079792, + "balance_loss_mlp": 1.07008803, + "diversity_loss_mlp": 0.0, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.06764455313032879, + "language_loss": 0.81366718, + "learning_rate": 0.00040585646703015165, + "loss": 0.82446504, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2981, + "time_per_iteration": 2.8000056743621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0740515, + "diversity_loss_mlp": 0.0, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07435230765684324, + "language_loss": 0.78094304, + "learning_rate": 0.0004055505152492419, + "loss": 0.79178286, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2982, + "time_per_iteration": 2.6867222785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_mlp": 1.06574273, + "diversity_loss_mlp": 0.0, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.06874763078804642, + "language_loss": 0.74040514, + "learning_rate": 0.00040524460013687425, + "loss": 0.7511642, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2983, + "time_per_iteration": 2.722419500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06058455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.06717754752260814, + "language_loss": 0.81118953, + "learning_rate": 0.0004049387218118155, + "loss": 0.82189637, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2984, + "time_per_iteration": 2.960744857788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.05519915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07543134348802799, + "language_loss": 0.85138291, + "learning_rate": 0.00040463288039281777, + "loss": 0.86203879, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2985, + "time_per_iteration": 2.769758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.03847778, + "diversity_loss_mlp": 0.0, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.0202426857746204, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78919691, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 2986, + "time_per_iteration": 4.966659784317017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_mlp": 1.05206716, + "diversity_loss_mlp": 0.0, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.15131369926607025, + "language_loss": 0.82060635, + "learning_rate": 0.0004040213087479444, + "loss": 0.83122802, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2987, + "time_per_iteration": 2.9445290565490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.0615747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.0782867157663105, + "language_loss": 0.85397077, + "learning_rate": 0.0004037155787595018, + "loss": 0.86468589, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2988, + "time_per_iteration": 2.5765254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066911, + "balance_loss_mlp": 1.05708241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06722963936024443, + "language_loss": 0.80743146, + "learning_rate": 0.000403409886151987, + "loss": 0.81810057, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2989, + "time_per_iteration": 2.916736364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_mlp": 1.02410662, + "diversity_loss_mlp": 0.0, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.01652195359171043, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.8302803, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.0480957, + "routerloss_mlp": 0.0, + "step": 2990, + "time_per_iteration": 4.79939866065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019783, + "balance_loss_mlp": 1.0149194, + "diversity_loss_mlp": 0.0, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.012607930583697005, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79218388, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 2991, + "time_per_iteration": 4.873241901397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_mlp": 1.06493187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07321689676824589, + "language_loss": 0.7675758, + "learning_rate": 0.00040249303380173807, + "loss": 0.77832061, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 2992, + "time_per_iteration": 3.119454860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075897, + "balance_loss_mlp": 1.06607461, + "diversity_loss_mlp": 0.0, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06951674167184135, + "language_loss": 0.78929973, + "learning_rate": 0.00040218749190459126, + "loss": 0.80005872, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 2993, + "time_per_iteration": 2.735741138458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074749, + "balance_loss_mlp": 1.06464601, + "diversity_loss_mlp": 0.0, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.09040694151318206, + "language_loss": 0.82524914, + "learning_rate": 0.00040188198798162775, + "loss": 0.83599663, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2994, + "time_per_iteration": 2.604189872741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107928, + "balance_loss_mlp": 1.06903386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.07247823517444965, + "language_loss": 0.85413349, + "learning_rate": 0.000401576522151455, + "loss": 0.86492634, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.10247803, + "routerloss_mlp": 0.0, + "step": 2995, + "time_per_iteration": 2.8580820560455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082336, + "balance_loss_mlp": 1.07231033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.07641213429349043, + "language_loss": 0.82611746, + "learning_rate": 0.0004012710945326651, + "loss": 0.83694082, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2996, + "time_per_iteration": 2.7899913787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093927, + "balance_loss_mlp": 1.08396673, + "diversity_loss_mlp": 0.0, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.06499516885792743, + "language_loss": 0.81305802, + "learning_rate": 0.0004009657052438355, + "loss": 0.82399726, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 2997, + "time_per_iteration": 2.7985143661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109354, + "balance_loss_mlp": 1.08339536, + "diversity_loss_mlp": 0.0, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.07919341256021087, + "language_loss": 0.85873878, + "learning_rate": 0.00040066035440352904, + "loss": 0.86967415, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2998, + "time_per_iteration": 2.633052110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_mlp": 1.02706063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.024696349234847453, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80325484, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2999, + "time_per_iteration": 4.901000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111633, + "balance_loss_mlp": 1.10161996, + "diversity_loss_mlp": 0.0, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.09685011562347093, + "language_loss": 0.76085562, + "learning_rate": 0.00040004976854266145, + "loss": 0.77197194, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3000, + "time_per_iteration": 2.5440561771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_mlp": 1.09615445, + "diversity_loss_mlp": 0.0, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.08566214489971447, + "language_loss": 0.81596673, + "learning_rate": 0.0003997445337591505, + "loss": 0.82703155, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.10327148, + "routerloss_mlp": 0.0, + "step": 3001, + "time_per_iteration": 2.6576101779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101254, + "balance_loss_mlp": 1.09120488, + "diversity_loss_mlp": 0.0, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.07034086792873868, + "language_loss": 0.74008942, + "learning_rate": 0.0003994393378982635, + "loss": 0.75110197, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3002, + "time_per_iteration": 2.646756172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_mlp": 1.02816153, + "diversity_loss_mlp": 0.0, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.018933197318392565, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80571294, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3003, + "time_per_iteration": 4.810927867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084692, + "balance_loss_mlp": 1.07440448, + "diversity_loss_mlp": 0.0, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.09168460196837042, + "language_loss": 0.8788178, + "learning_rate": 0.0003988290634182961, + "loss": 0.88966477, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3004, + "time_per_iteration": 2.8026678562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086517, + "balance_loss_mlp": 1.0765686, + "diversity_loss_mlp": 0.0, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07023697016091271, + "language_loss": 0.80836314, + "learning_rate": 0.0003985239850361453, + "loss": 0.81922829, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3005, + "time_per_iteration": 2.605581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_mlp": 1.0739491, + "diversity_loss_mlp": 0.0, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.08589270039345176, + "language_loss": 0.84542817, + "learning_rate": 0.0003982189460504777, + "loss": 0.85626608, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3006, + "time_per_iteration": 2.755309820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081707, + "balance_loss_mlp": 1.07148504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07367765629951939, + "language_loss": 0.79058981, + "learning_rate": 0.00039791394657971935, + "loss": 0.80140698, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3007, + "time_per_iteration": 2.7115721702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083463, + "balance_loss_mlp": 1.07349145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08639799759711958, + "language_loss": 0.84195948, + "learning_rate": 0.00039760898674228205, + "loss": 0.85279417, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3008, + "time_per_iteration": 2.6536192893981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.07249665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.06522284264232586, + "language_loss": 0.80620825, + "learning_rate": 0.0003973040666565613, + "loss": 0.81703728, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 3009, + "time_per_iteration": 3.0663528442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_mlp": 1.07382393, + "diversity_loss_mlp": 0.0, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06612730330601824, + "language_loss": 0.82148051, + "learning_rate": 0.000396999186440938, + "loss": 0.83232027, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3010, + "time_per_iteration": 2.8332176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078314, + "balance_loss_mlp": 1.06794286, + "diversity_loss_mlp": 0.0, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.0828593686110812, + "language_loss": 0.85258269, + "learning_rate": 0.000396694346213777, + "loss": 0.86336583, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 3011, + "time_per_iteration": 2.6009714603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107282, + "balance_loss_mlp": 1.06272256, + "diversity_loss_mlp": 0.0, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06962390382868744, + "language_loss": 0.83265769, + "learning_rate": 0.0003963895460934276, + "loss": 0.84338593, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3012, + "time_per_iteration": 3.1654391288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069146, + "balance_loss_mlp": 1.05900097, + "diversity_loss_mlp": 0.0, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07925389671051855, + "language_loss": 0.84790504, + "learning_rate": 0.00039608478619822376, + "loss": 0.85859656, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.10144043, + "routerloss_mlp": 0.0, + "step": 3013, + "time_per_iteration": 2.427522659301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.05792189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06006231039706783, + "language_loss": 0.82350284, + "learning_rate": 0.00039578006664648394, + "loss": 0.83418107, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3014, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073341, + "balance_loss_mlp": 1.06352377, + "diversity_loss_mlp": 0.0, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06972986465808689, + "language_loss": 0.81348431, + "learning_rate": 0.0003954753875565105, + "loss": 0.82421774, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3015, + "time_per_iteration": 3.0640695095062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072847, + "balance_loss_mlp": 1.06282723, + "diversity_loss_mlp": 0.0, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.07357715078918559, + "language_loss": 0.82623494, + "learning_rate": 0.00039517074904659057, + "loss": 0.83696342, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3016, + "time_per_iteration": 2.6665265560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010727, + "balance_loss_mlp": 1.06269789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.06753013197016527, + "language_loss": 0.84737754, + "learning_rate": 0.00039486615123499535, + "loss": 0.85810453, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3017, + "time_per_iteration": 2.868724822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067949, + "balance_loss_mlp": 1.05761325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.06414820954678578, + "language_loss": 0.84855384, + "learning_rate": 0.00039456159423997996, + "loss": 0.85923326, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 3018, + "time_per_iteration": 2.7043581008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067563, + "balance_loss_mlp": 1.05765033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06908857206879536, + "language_loss": 0.89950442, + "learning_rate": 0.00039425707817978406, + "loss": 0.91018009, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3019, + "time_per_iteration": 2.661128044128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106838, + "balance_loss_mlp": 1.0578835, + "diversity_loss_mlp": 0.0, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.08125232064199928, + "language_loss": 0.83649898, + "learning_rate": 0.00039395260317263124, + "loss": 0.84718275, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 3020, + "time_per_iteration": 2.5645148754119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.06039524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.06887634041791851, + "language_loss": 0.85043871, + "learning_rate": 0.0003936481693367291, + "loss": 0.86114681, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3021, + "time_per_iteration": 2.7062771320343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.06673217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08641696356618225, + "language_loss": 0.87619507, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697034, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 3022, + "time_per_iteration": 2.7680017948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.06846249, + "diversity_loss_mlp": 0.0, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.0708496595357851, + "language_loss": 0.78467089, + "learning_rate": 0.00039303942565142825, + "loss": 0.79545891, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3023, + "time_per_iteration": 2.7319986820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071706, + "balance_loss_mlp": 1.06121564, + "diversity_loss_mlp": 0.0, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.06941107329713525, + "language_loss": 0.76844412, + "learning_rate": 0.0003927351160383644, + "loss": 0.77916121, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3024, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.05980492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.07084631667240687, + "language_loss": 0.77815473, + "learning_rate": 0.000392430848069222, + "loss": 0.78885376, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3025, + "time_per_iteration": 2.5290136337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06532741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.07224483468752362, + "language_loss": 0.82501459, + "learning_rate": 0.00039212662186212795, + "loss": 0.83576977, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3026, + "time_per_iteration": 2.6017684936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106609, + "balance_loss_mlp": 1.05593956, + "diversity_loss_mlp": 0.0, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.05478704818063415, + "language_loss": 0.77076197, + "learning_rate": 0.0003918224375351934, + "loss": 0.78142285, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3027, + "time_per_iteration": 2.707127571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069708, + "balance_loss_mlp": 1.05940795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.07026049561627037, + "language_loss": 0.78559566, + "learning_rate": 0.0003915182952065135, + "loss": 0.79629278, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 3028, + "time_per_iteration": 2.6728062629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863261, + "balance_loss_mlp": 1.48110199, + "diversity_loss_mlp": 0.21947324, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.028926470462326558, + "language_loss": 0.87632734, + "learning_rate": 0.0003912141949941664, + "loss": 0.88495994, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0129736, + "step": 3029, + "time_per_iteration": 2.7290279865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05748928, + "diversity_loss_mlp": 0.0, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.11092566755711959, + "language_loss": 0.82848042, + "learning_rate": 0.0003909101370162143, + "loss": 0.83916146, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 3030, + "time_per_iteration": 2.5907628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.05161262, + "diversity_loss_mlp": 0.0, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.028764883169419067, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73491609, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.06103516, + "routerloss_mlp": 0.0, + "step": 3031, + "time_per_iteration": 4.87787127494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066859, + "balance_loss_mlp": 1.05651772, + "diversity_loss_mlp": 0.0, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.06710106844205427, + "language_loss": 0.82853395, + "learning_rate": 0.0003903021482356622, + "loss": 0.83920258, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3032, + "time_per_iteration": 2.777536153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05757427, + "diversity_loss_mlp": 0.0, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.05521171326439417, + "language_loss": 0.82775813, + "learning_rate": 0.00038999821766910465, + "loss": 0.83843517, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 3033, + "time_per_iteration": 2.990370035171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.05444503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.06933125597123427, + "language_loss": 0.85725427, + "learning_rate": 0.00038969432980902606, + "loss": 0.86790228, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 3034, + "time_per_iteration": 2.522594690322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101659, + "balance_loss_mlp": 1.01134527, + "diversity_loss_mlp": 0.0, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.016170176694849804, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80801094, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.05249023, + "routerloss_mlp": 0.0, + "step": 3035, + "time_per_iteration": 4.804777383804321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.06007361, + "diversity_loss_mlp": 0.0, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.06630987198212972, + "language_loss": 0.82630336, + "learning_rate": 0.00038908668268020953, + "loss": 0.83700585, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3036, + "time_per_iteration": 2.6598165035247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0547123, + "diversity_loss_mlp": 0.0, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.06353975651870693, + "language_loss": 0.85077345, + "learning_rate": 0.00038878292364738097, + "loss": 0.86142278, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3037, + "time_per_iteration": 2.817431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_mlp": 1.05653155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.06847185322789755, + "language_loss": 0.86992419, + "learning_rate": 0.0003884792077928508, + "loss": 0.88059008, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 3038, + "time_per_iteration": 2.515582323074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05704808, + "diversity_loss_mlp": 0.0, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.08132102193369704, + "language_loss": 0.76704037, + "learning_rate": 0.0003881755352345322, + "loss": 0.77771461, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3039, + "time_per_iteration": 2.506476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.05959702, + "diversity_loss_mlp": 0.0, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05655703451029381, + "language_loss": 0.87182224, + "learning_rate": 0.0003878719060903207, + "loss": 0.88252252, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 3040, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077595, + "balance_loss_mlp": 1.06733704, + "diversity_loss_mlp": 0.0, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.07213898072930079, + "language_loss": 0.83620822, + "learning_rate": 0.0003875683204780961, + "loss": 0.84698415, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3041, + "time_per_iteration": 2.7087528705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00858209, + "balance_loss_mlp": 1.47420132, + "diversity_loss_mlp": 0.21720865, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.0337374590034744, + "language_loss": 0.85750413, + "learning_rate": 0.00038726477851572043, + "loss": 0.86608613, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01250451, + "step": 3042, + "time_per_iteration": 2.8391060829162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085797, + "balance_loss_mlp": 1.07552087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.07424787281712622, + "language_loss": 0.8043561, + "learning_rate": 0.0003869612803210395, + "loss": 0.81521404, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3043, + "time_per_iteration": 2.6728439331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.07525158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.0731909762270397, + "language_loss": 0.83286428, + "learning_rate": 0.0003866578260118817, + "loss": 0.8437193, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 3044, + "time_per_iteration": 2.6332969665527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108221, + "balance_loss_mlp": 1.07239914, + "diversity_loss_mlp": 0.0, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07445534470947208, + "language_loss": 0.82966632, + "learning_rate": 0.0003863544157060581, + "loss": 0.84048843, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3045, + "time_per_iteration": 2.668837785720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081605, + "balance_loss_mlp": 1.07137656, + "diversity_loss_mlp": 0.0, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.07387128485113956, + "language_loss": 0.82359195, + "learning_rate": 0.0003860510495213634, + "loss": 0.83440793, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3046, + "time_per_iteration": 2.8229498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106696, + "balance_loss_mlp": 1.05705416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08160785595799389, + "language_loss": 0.78622752, + "learning_rate": 0.0003857477275755746, + "loss": 0.79689717, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3047, + "time_per_iteration": 2.6294050216674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.0557915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0580402220657833, + "language_loss": 0.83646655, + "learning_rate": 0.00038544444998645167, + "loss": 0.84712666, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3048, + "time_per_iteration": 3.0289785861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059507, + "balance_loss_mlp": 1.04951751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.0674332369398686, + "language_loss": 0.81847656, + "learning_rate": 0.00038514121687173767, + "loss": 0.82907164, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3049, + "time_per_iteration": 2.5797152519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058576, + "balance_loss_mlp": 1.04861593, + "diversity_loss_mlp": 0.0, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.08495884025795868, + "language_loss": 0.82019609, + "learning_rate": 0.00038483802834915807, + "loss": 0.83078188, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3050, + "time_per_iteration": 3.0199241638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_mlp": 1.05154216, + "diversity_loss_mlp": 0.0, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.07816426751212531, + "language_loss": 0.78978479, + "learning_rate": 0.00038453488453642074, + "loss": 0.800403, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3051, + "time_per_iteration": 2.7338953018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_mlp": 1.04610801, + "diversity_loss_mlp": 0.0, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.07385283463746846, + "language_loss": 0.86878967, + "learning_rate": 0.00038423178555121697, + "loss": 0.87935388, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.10308838, + "routerloss_mlp": 0.0, + "step": 3052, + "time_per_iteration": 2.7545297145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_mlp": 1.04783666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.07920619209623277, + "language_loss": 0.85583031, + "learning_rate": 0.00038392873151121994, + "loss": 0.86641347, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 3053, + "time_per_iteration": 3.07143235206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04924083, + "diversity_loss_mlp": 0.0, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07754087781816771, + "language_loss": 0.83137167, + "learning_rate": 0.0003836257225340859, + "loss": 0.84196955, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 3054, + "time_per_iteration": 2.6132304668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066843, + "balance_loss_mlp": 1.05597091, + "diversity_loss_mlp": 0.0, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.0689474058081498, + "language_loss": 0.82020974, + "learning_rate": 0.00038332275873745336, + "loss": 0.83087826, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.10882568, + "routerloss_mlp": 0.0, + "step": 3055, + "time_per_iteration": 3.107823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855378, + "balance_loss_mlp": 1.46855807, + "diversity_loss_mlp": 0.21676093, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.026786885849911755, + "language_loss": 0.82891941, + "learning_rate": 0.0003830198402389431, + "loss": 0.83747321, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01271825, + "step": 3056, + "time_per_iteration": 2.7645249366760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_mlp": 1.03548789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.027829027984012215, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78389645, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3057, + "time_per_iteration": 4.995454549789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082248, + "balance_loss_mlp": 1.07115602, + "diversity_loss_mlp": 0.0, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.10105227922023945, + "language_loss": 0.83302426, + "learning_rate": 0.0003824141396066855, + "loss": 0.8438468, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 3058, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086707, + "balance_loss_mlp": 1.07570362, + "diversity_loss_mlp": 0.0, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.10870959422332387, + "language_loss": 0.8283565, + "learning_rate": 0.000382111357708092, + "loss": 0.83922356, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 3059, + "time_per_iteration": 2.7063958644866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06985879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.09017347087331092, + "language_loss": 0.83373827, + "learning_rate": 0.00038180862157792864, + "loss": 0.84454447, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 3060, + "time_per_iteration": 2.7716259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.06098306, + "diversity_loss_mlp": 0.0, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06780881013643715, + "language_loss": 0.81814772, + "learning_rate": 0.0003815059313337279, + "loss": 0.82886124, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3061, + "time_per_iteration": 2.664134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.06180596, + "diversity_loss_mlp": 0.0, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.06335749004143083, + "language_loss": 0.78063929, + "learning_rate": 0.00038120328709300436, + "loss": 0.79135942, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3062, + "time_per_iteration": 2.8627028465270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.05566847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06769296518732247, + "language_loss": 0.8382163, + "learning_rate": 0.0003809006889732549, + "loss": 0.84887671, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3063, + "time_per_iteration": 2.809983253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.05686879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.07471445768221775, + "language_loss": 0.88052714, + "learning_rate": 0.0003805981370919589, + "loss": 0.89119434, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3064, + "time_per_iteration": 2.526881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_mlp": 1.05822492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.06588713514234819, + "language_loss": 0.83812523, + "learning_rate": 0.0003802956315665771, + "loss": 0.84880579, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3065, + "time_per_iteration": 2.6691834926605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072593, + "balance_loss_mlp": 1.06285346, + "diversity_loss_mlp": 0.0, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.11425397529110681, + "language_loss": 0.8185159, + "learning_rate": 0.0003799931725145529, + "loss": 0.82924175, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3066, + "time_per_iteration": 2.6098556518554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06719375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.07983506473752326, + "language_loss": 0.85902935, + "learning_rate": 0.00037969076005331083, + "loss": 0.86980045, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3067, + "time_per_iteration": 2.7626185417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081851, + "balance_loss_mlp": 1.07184935, + "diversity_loss_mlp": 0.0, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07247659487205776, + "language_loss": 0.8802191, + "learning_rate": 0.00037938839430025817, + "loss": 0.89103758, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3068, + "time_per_iteration": 2.6493396759033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088575, + "balance_loss_mlp": 1.07886577, + "diversity_loss_mlp": 0.0, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.0655302097756617, + "language_loss": 0.85496283, + "learning_rate": 0.0003790860753727835, + "loss": 0.8658486, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3069, + "time_per_iteration": 2.7941815853118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089673, + "balance_loss_mlp": 1.07995713, + "diversity_loss_mlp": 0.0, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0796849495747384, + "language_loss": 0.82864797, + "learning_rate": 0.00037878380338825766, + "loss": 0.83954477, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3070, + "time_per_iteration": 2.6861939430236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102877, + "balance_loss_mlp": 1.09311378, + "diversity_loss_mlp": 0.0, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.08458672700427887, + "language_loss": 0.81556624, + "learning_rate": 0.00037848157846403287, + "loss": 0.82659507, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3071, + "time_per_iteration": 2.873662233352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101959, + "balance_loss_mlp": 1.09236836, + "diversity_loss_mlp": 0.0, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.07248408902015292, + "language_loss": 0.83281767, + "learning_rate": 0.0003781794007174435, + "loss": 0.84383726, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3072, + "time_per_iteration": 2.762472629547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.08360386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.032251872290910595, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75162888, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3073, + "time_per_iteration": 4.854618787765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_mlp": 1.09715033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.058981009489694675, + "language_loss": 0.80947924, + "learning_rate": 0.0003775751872264152, + "loss": 0.8205511, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3074, + "time_per_iteration": 2.771085023880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_mlp": 1.09195375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.056077752757325364, + "language_loss": 0.87175214, + "learning_rate": 0.0003772731517165527, + "loss": 0.88277197, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.10028076, + "routerloss_mlp": 0.0, + "step": 3075, + "time_per_iteration": 2.8292393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103862, + "balance_loss_mlp": 1.09419441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.07602524147414737, + "language_loss": 0.83311272, + "learning_rate": 0.0003769711638534784, + "loss": 0.84415126, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3076, + "time_per_iteration": 2.97261381149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099488, + "balance_loss_mlp": 1.08962953, + "diversity_loss_mlp": 0.0, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07287223806238774, + "language_loss": 0.79046565, + "learning_rate": 0.00037666922375443446, + "loss": 0.8014605, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3077, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093205, + "balance_loss_mlp": 1.08349538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06803693763690793, + "language_loss": 0.81907725, + "learning_rate": 0.00037636733153664396, + "loss": 0.83000934, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3078, + "time_per_iteration": 2.8055219650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109815, + "balance_loss_mlp": 1.08854795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.08595437511710807, + "language_loss": 0.80202127, + "learning_rate": 0.0003760654873173124, + "loss": 0.81300277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3079, + "time_per_iteration": 2.6700353622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089136, + "balance_loss_mlp": 1.07927787, + "diversity_loss_mlp": 0.0, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06826446524438025, + "language_loss": 0.82043588, + "learning_rate": 0.00037576369121362566, + "loss": 0.8313272, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3080, + "time_per_iteration": 2.596071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089019, + "balance_loss_mlp": 1.07946444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.057614109423291045, + "language_loss": 0.81680822, + "learning_rate": 0.0003754619433427516, + "loss": 0.82769841, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3081, + "time_per_iteration": 2.9003093242645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.07771826, + "diversity_loss_mlp": 0.0, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.09118109008842482, + "language_loss": 0.7796042, + "learning_rate": 0.0003751602438218392, + "loss": 0.79047692, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3082, + "time_per_iteration": 2.7739951610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06927121, + "diversity_loss_mlp": 0.0, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.07641398361038237, + "language_loss": 0.84107417, + "learning_rate": 0.0003748585927680186, + "loss": 0.85186076, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3083, + "time_per_iteration": 2.6706809997558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087865, + "balance_loss_mlp": 1.07850111, + "diversity_loss_mlp": 0.0, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.07450452823339063, + "language_loss": 0.82992828, + "learning_rate": 0.00037455699029840086, + "loss": 0.84080696, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3084, + "time_per_iteration": 2.648775100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082396, + "balance_loss_mlp": 1.07310402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.0678124296562273, + "language_loss": 0.84694779, + "learning_rate": 0.0003742554365300787, + "loss": 0.85777175, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3085, + "time_per_iteration": 2.787437677383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00854998, + "balance_loss_mlp": 1.4632709, + "diversity_loss_mlp": 0.21810779, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.030613192067315453, + "language_loss": 0.79049134, + "learning_rate": 0.0003739539315801255, + "loss": 0.79904133, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01430825, + "step": 3086, + "time_per_iteration": 2.9476425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088902, + "balance_loss_mlp": 1.07956231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.08021663243926581, + "language_loss": 0.91758776, + "learning_rate": 0.000373652475565596, + "loss": 0.92847675, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3087, + "time_per_iteration": 2.473820924758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086482, + "balance_loss_mlp": 1.07684994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.0746565513598584, + "language_loss": 0.81288451, + "learning_rate": 0.00037335106860352587, + "loss": 0.8237493, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3088, + "time_per_iteration": 2.6710119247436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.07624292, + "diversity_loss_mlp": 0.0, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.06157127364570171, + "language_loss": 0.82947195, + "learning_rate": 0.00037304971081093146, + "loss": 0.84033072, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3089, + "time_per_iteration": 2.5530550479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095759, + "balance_loss_mlp": 1.0863055, + "diversity_loss_mlp": 0.0, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.06188782031055571, + "language_loss": 0.80896157, + "learning_rate": 0.00037274840230481024, + "loss": 0.81991911, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3090, + "time_per_iteration": 2.707697868347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094547, + "balance_loss_mlp": 1.08488476, + "diversity_loss_mlp": 0.0, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07660649649984981, + "language_loss": 0.79309815, + "learning_rate": 0.00037244714320214077, + "loss": 0.80404359, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3091, + "time_per_iteration": 2.524418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094365, + "balance_loss_mlp": 1.08449435, + "diversity_loss_mlp": 0.0, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.07189913531932149, + "language_loss": 0.83442843, + "learning_rate": 0.000372145933619882, + "loss": 0.84537208, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3092, + "time_per_iteration": 2.889267683029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098289, + "balance_loss_mlp": 1.0883646, + "diversity_loss_mlp": 0.0, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.08404319768947686, + "language_loss": 0.82928061, + "learning_rate": 0.000371844773674974, + "loss": 0.84026349, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3093, + "time_per_iteration": 2.729433059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00849837, + "balance_loss_mlp": 1.45755267, + "diversity_loss_mlp": 0.21677493, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.03215359042810467, + "language_loss": 0.82038867, + "learning_rate": 0.0003715436634843375, + "loss": 0.82888705, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267278, + "step": 3094, + "time_per_iteration": 2.8759658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110065, + "balance_loss_mlp": 1.10049295, + "diversity_loss_mlp": 0.0, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.05868361705811182, + "language_loss": 0.80998492, + "learning_rate": 0.00037124260316487355, + "loss": 0.82108557, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3095, + "time_per_iteration": 2.8515610694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.11049807, + "diversity_loss_mlp": 0.0, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06311708190042467, + "language_loss": 0.89435279, + "learning_rate": 0.0003709415928334643, + "loss": 0.90555483, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3096, + "time_per_iteration": 2.5820794105529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00850727, + "balance_loss_mlp": 1.45894229, + "diversity_loss_mlp": 0.21772251, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.03378868601366531, + "language_loss": 0.80653715, + "learning_rate": 0.00037064063260697233, + "loss": 0.81504446, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01239414, + "step": 3097, + "time_per_iteration": 2.897676467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138893, + "balance_loss_mlp": 1.12893891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06769209825818075, + "language_loss": 0.78597271, + "learning_rate": 0.0003703397226022407, + "loss": 0.79736161, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3098, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_mlp": 1.05123568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.0345928166567928, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76556545, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 3099, + "time_per_iteration": 4.977718114852905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847219, + "balance_loss_mlp": 1.45243645, + "diversity_loss_mlp": 0.21764749, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.029968084230811296, + "language_loss": 0.83180296, + "learning_rate": 0.0003697380537253339, + "loss": 0.84027505, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01217673, + "step": 3100, + "time_per_iteration": 2.673551559448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.11119175, + "diversity_loss_mlp": 0.0, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06630352939366652, + "language_loss": 0.81596649, + "learning_rate": 0.0003694372950867471, + "loss": 0.82717824, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3101, + "time_per_iteration": 2.7776670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119741, + "balance_loss_mlp": 1.1100198, + "diversity_loss_mlp": 0.0, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.07189145573728124, + "language_loss": 0.77408171, + "learning_rate": 0.0003691365871370976, + "loss": 0.78527915, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3102, + "time_per_iteration": 3.04355525970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116521, + "balance_loss_mlp": 1.1067102, + "diversity_loss_mlp": 0.0, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06839859357083694, + "language_loss": 0.8504554, + "learning_rate": 0.00036883592999313093, + "loss": 0.8616206, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3103, + "time_per_iteration": 2.6881608963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.1020087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07720585150601726, + "language_loss": 0.7960434, + "learning_rate": 0.0003685353237715722, + "loss": 0.80715817, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3104, + "time_per_iteration": 2.910879135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_mlp": 1.09433126, + "diversity_loss_mlp": 0.0, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.08349083770410728, + "language_loss": 0.81658864, + "learning_rate": 0.0003682347685891274, + "loss": 0.82762903, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3105, + "time_per_iteration": 2.8556530475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093856, + "balance_loss_mlp": 1.08412814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07861180875636395, + "language_loss": 0.80587226, + "learning_rate": 0.0003679342645624822, + "loss": 0.81681079, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3106, + "time_per_iteration": 2.9788949489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091288, + "balance_loss_mlp": 1.08144689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.062123999367099406, + "language_loss": 0.81345969, + "learning_rate": 0.0003676338118083025, + "loss": 0.82437259, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3107, + "time_per_iteration": 3.0514276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083265, + "balance_loss_mlp": 1.07369304, + "diversity_loss_mlp": 0.0, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.07200241428310707, + "language_loss": 0.79341209, + "learning_rate": 0.0003673334104432347, + "loss": 0.8042447, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3108, + "time_per_iteration": 2.6402766704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_mlp": 1.07493854, + "diversity_loss_mlp": 0.0, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.06431634181531254, + "language_loss": 0.83437502, + "learning_rate": 0.0003670330605839048, + "loss": 0.84521937, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3109, + "time_per_iteration": 2.8350021839141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071839, + "balance_loss_mlp": 1.06252289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08338826074003908, + "language_loss": 0.76629049, + "learning_rate": 0.0003667327623469191, + "loss": 0.77700889, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3110, + "time_per_iteration": 2.7434427738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086579, + "balance_loss_mlp": 1.0770725, + "diversity_loss_mlp": 0.0, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07334566089126898, + "language_loss": 0.7758621, + "learning_rate": 0.00036643251584886333, + "loss": 0.78672791, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3111, + "time_per_iteration": 2.7712619304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080276, + "balance_loss_mlp": 1.07075715, + "diversity_loss_mlp": 0.0, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.0661546294312284, + "language_loss": 0.81729323, + "learning_rate": 0.00036613232120630393, + "loss": 0.82809597, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3112, + "time_per_iteration": 2.6437926292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.06822348, + "diversity_loss_mlp": 0.0, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.09952194732663294, + "language_loss": 0.80305058, + "learning_rate": 0.00036583217853578643, + "loss": 0.81383061, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3113, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085265, + "balance_loss_mlp": 1.07562053, + "diversity_loss_mlp": 0.0, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.09394979208953491, + "language_loss": 0.77671385, + "learning_rate": 0.000365532087953837, + "loss": 0.78756654, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3114, + "time_per_iteration": 3.6197850704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075561, + "balance_loss_mlp": 1.06598282, + "diversity_loss_mlp": 0.0, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.08322265150120763, + "language_loss": 0.89675403, + "learning_rate": 0.00036523204957696065, + "loss": 0.90750962, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3115, + "time_per_iteration": 2.5928850173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05900383, + "diversity_loss_mlp": 0.0, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.07018475264035358, + "language_loss": 0.80565965, + "learning_rate": 0.00036493206352164324, + "loss": 0.81634748, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3116, + "time_per_iteration": 2.9302330017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070118, + "balance_loss_mlp": 1.06046212, + "diversity_loss_mlp": 0.0, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.07338463965566117, + "language_loss": 0.85090643, + "learning_rate": 0.000364632129904349, + "loss": 0.86160767, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3117, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072158, + "balance_loss_mlp": 1.0622344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.06545944211786243, + "language_loss": 0.78013116, + "learning_rate": 0.00036433224884152283, + "loss": 0.79085279, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3118, + "time_per_iteration": 2.714756727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_mlp": 1.06249511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08041065589047977, + "language_loss": 0.77752131, + "learning_rate": 0.00036403242044958875, + "loss": 0.78824466, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3119, + "time_per_iteration": 2.583292245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078089, + "balance_loss_mlp": 1.06846261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.07420053325288596, + "language_loss": 0.91699272, + "learning_rate": 0.0003637326448449507, + "loss": 0.92777365, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3120, + "time_per_iteration": 2.717006206512451 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7078069337325568.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/training_args.bin b/sft_pretrain/Full_competesmoev30/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_competesmoev30/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/config.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28a5bb1c149304f33214eee3c6e2764711ffb065 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/generation_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb7cf961b27abb4401884effb6230842d4154472 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02eb6b113920d264af9fff5fae7c66ec1263785a29526e4f98ef0c995c13c701 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f4af8b5e9a5fbb0aad8dda78b37bce135c9990f --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73cdd59612f7a54523f3f36087dadb9e960e99452f8d56de1b473bc22cbca774 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9293a9dde16f249543d2377c9712399d10b6eacc --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe82104af0173aa3b006a91bb55abcc39d29ca11727a877ea7227971a83e102 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92db8e23143a0b3d77e607b91e28d859e1672cc3 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96dd16bceb2cbeecf85eab56d5b76516b4a5f643e7c8c241afeb18734d79b4ab +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5a2fe963e263f9e44fd7614a0a8ec65bba4b103 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8d0a7e14794e5b1bda54c9d52576007bc399e73300b1ec15453302f3c1f615 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47b0275c06ee458a6dc149f5140d3937ebe9b2b3 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ed38c7819dbd320ebea6d64c618bdb08b9a638ac90d27f55ccf62d04bb66c3 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a6a27149b1b8269ebf5f954de66986fd091fa53 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b31dc11a13ccc2063cef037e46a356eea38c079b448195233889ed736fec0103 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22dfd69660db1eb569071746239b289136370b2b --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e514cfa6f2b765dafb5691c7ab13c290ab9b9da6943ec72dae5f08e9849e296e +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/latest b/sft_pretrain/Full_competesmoev30/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b0b64c4297ac87180a08b164d6b3534092c09ca --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdb69b9c5e24e9bd3bc8237f750236335435f29822b26a9b3b8dc3328441c82 +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74aaffdc337c5a168a279aed341c53617abfb292 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7428511a0f39116505eb0e78fefd1d50fe2ddacee4482cdd5d925938d450347 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_competesmoev30/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c3bf6bff776b49da0969dde06191dc95e57afb --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/trainer_state.json @@ -0,0 +1,70153 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.10879421, + "diversity_loss_mlp": 0.0, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0911420547130344, + "language_loss": 0.8431657, + "learning_rate": 0.000925888133132719, + "loss": 0.85441184, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1041, + "time_per_iteration": 2.780141830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_mlp": 1.05260694, + "diversity_loss_mlp": 0.0, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.04139604987307943, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80673575, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 1042, + "time_per_iteration": 4.971017360687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100746, + "balance_loss_mlp": 1.08498645, + "diversity_loss_mlp": 0.0, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.08950731646766712, + "language_loss": 0.81070006, + "learning_rate": 0.0009255613649386244, + "loss": 0.82170749, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.1574707, + "routerloss_mlp": 0.0, + "step": 1043, + "time_per_iteration": 2.6508612632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091355, + "balance_loss_mlp": 1.07623935, + "diversity_loss_mlp": 0.0, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07614483401418765, + "language_loss": 0.78829026, + "learning_rate": 0.0009253977329834838, + "loss": 0.79920387, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1044, + "time_per_iteration": 2.7090582847595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109464, + "balance_loss_mlp": 1.07947624, + "diversity_loss_mlp": 0.0, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.0989854096864982, + "language_loss": 0.86366481, + "learning_rate": 0.0009252339358742965, + "loss": 0.8746112, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1045, + "time_per_iteration": 2.801323652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_mlp": 1.08526874, + "diversity_loss_mlp": 0.0, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07994799859902735, + "language_loss": 0.83704323, + "learning_rate": 0.000925069973674654, + "loss": 0.84804672, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1046, + "time_per_iteration": 2.6286635398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_mlp": 1.09036636, + "diversity_loss_mlp": 0.0, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.05803081938267982, + "language_loss": 0.88841283, + "learning_rate": 0.000924905846448212, + "loss": 0.89945889, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1047, + "time_per_iteration": 2.7208023071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.12078381, + "diversity_loss_mlp": 0.0, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09159511175118457, + "language_loss": 0.85692465, + "learning_rate": 0.0009247415542586906, + "loss": 0.86827493, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1048, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089504, + "balance_loss_mlp": 1.55797935, + "diversity_loss_mlp": 0.19993141, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.028193920194447036, + "language_loss": 0.83094788, + "learning_rate": 0.0009245770971698735, + "loss": 0.83989829, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01608507, + "step": 1049, + "time_per_iteration": 2.922792911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.12878203, + "diversity_loss_mlp": 0.0, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08345797467079887, + "language_loss": 0.88434327, + "learning_rate": 0.0009244124752456087, + "loss": 0.89577425, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1050, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141188, + "balance_loss_mlp": 1.12675214, + "diversity_loss_mlp": 0.0, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07479960387863874, + "language_loss": 0.85303241, + "learning_rate": 0.0009242476885498081, + "loss": 0.86444432, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1051, + "time_per_iteration": 2.8012773990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146474, + "balance_loss_mlp": 1.13181126, + "diversity_loss_mlp": 0.0, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.07632391919964465, + "language_loss": 0.81114984, + "learning_rate": 0.0009240827371464474, + "loss": 0.82261455, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1052, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146729, + "balance_loss_mlp": 1.1323998, + "diversity_loss_mlp": 0.0, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.11219768477147798, + "language_loss": 0.84167284, + "learning_rate": 0.0009239176210995666, + "loss": 0.85314012, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1053, + "time_per_iteration": 3.4905290603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153158, + "balance_loss_mlp": 1.13878179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.07345468089138417, + "language_loss": 0.93850195, + "learning_rate": 0.0009237523404732695, + "loss": 0.95003355, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1054, + "time_per_iteration": 2.8854215145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116831, + "balance_loss_mlp": 1.15374279, + "diversity_loss_mlp": 0.0, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08788286689344726, + "language_loss": 0.84136868, + "learning_rate": 0.0009235868953317235, + "loss": 0.85305184, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1055, + "time_per_iteration": 2.785616397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115453, + "balance_loss_mlp": 1.14033246, + "diversity_loss_mlp": 0.0, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.07006303181868268, + "language_loss": 0.85314858, + "learning_rate": 0.0009234212857391602, + "loss": 0.86469388, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1056, + "time_per_iteration": 3.192293167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167757, + "balance_loss_mlp": 1.15304708, + "diversity_loss_mlp": 0.0, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.07469852363602907, + "language_loss": 0.89220309, + "learning_rate": 0.000923255511759875, + "loss": 0.9038806, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1057, + "time_per_iteration": 2.783778429031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881428, + "balance_loss_mlp": 1.53356147, + "diversity_loss_mlp": 0.1968638, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.032510948660132113, + "language_loss": 0.84587663, + "learning_rate": 0.000923089573458227, + "loss": 0.85469091, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621579, + "step": 1058, + "time_per_iteration": 2.8847100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_mlp": 1.13623881, + "diversity_loss_mlp": 0.0, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.11181454207252314, + "language_loss": 0.83516467, + "learning_rate": 0.0009229234708986392, + "loss": 0.84667218, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1059, + "time_per_iteration": 2.9079415798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172867, + "balance_loss_mlp": 1.16251993, + "diversity_loss_mlp": 0.0, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.06024273804144221, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82839763, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1060, + "time_per_iteration": 4.646218776702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112152, + "balance_loss_mlp": 1.10713172, + "diversity_loss_mlp": 0.0, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.08928557521337042, + "language_loss": 0.85345757, + "learning_rate": 0.0009225907732636548, + "loss": 0.86467278, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1061, + "time_per_iteration": 2.745448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_mlp": 1.09209883, + "diversity_loss_mlp": 0.0, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.079028173596336, + "language_loss": 0.86936563, + "learning_rate": 0.0009224241783174227, + "loss": 0.88042819, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1062, + "time_per_iteration": 2.6923935413360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_mlp": 1.07616472, + "diversity_loss_mlp": 0.0, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.07452632641130948, + "language_loss": 0.85384166, + "learning_rate": 0.0009222574193715802, + "loss": 0.86474669, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1063, + "time_per_iteration": 2.7701327800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07850981, + "diversity_loss_mlp": 0.0, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06517233034985846, + "language_loss": 0.85915947, + "learning_rate": 0.000922090496490869, + "loss": 0.87008905, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1064, + "time_per_iteration": 2.7387099266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098934, + "balance_loss_mlp": 1.08404493, + "diversity_loss_mlp": 0.0, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.06963355430403552, + "language_loss": 0.89889115, + "learning_rate": 0.0009219234097400937, + "loss": 0.90988052, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1065, + "time_per_iteration": 2.859334707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112884, + "balance_loss_mlp": 1.09778059, + "diversity_loss_mlp": 0.0, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06723697540994414, + "language_loss": 0.83086514, + "learning_rate": 0.0009217561591841237, + "loss": 0.84199405, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1066, + "time_per_iteration": 3.3065547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00886484, + "balance_loss_mlp": 1.54046464, + "diversity_loss_mlp": 0.1982768, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.03984406199709606, + "language_loss": 0.80820358, + "learning_rate": 0.0009215887448878913, + "loss": 0.8170684, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01711285, + "step": 1067, + "time_per_iteration": 2.6291754245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131678, + "balance_loss_mlp": 1.11697936, + "diversity_loss_mlp": 0.0, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.07633348035576148, + "language_loss": 0.85365784, + "learning_rate": 0.0009214211669163922, + "loss": 0.86497462, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1068, + "time_per_iteration": 2.747936725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_mlp": 1.12220347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.07197705825645119, + "language_loss": 0.9405331, + "learning_rate": 0.0009212534253346862, + "loss": 0.95190227, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1069, + "time_per_iteration": 2.696131467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128507, + "balance_loss_mlp": 1.11372542, + "diversity_loss_mlp": 0.0, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09743186487320747, + "language_loss": 0.84269625, + "learning_rate": 0.0009210855202078964, + "loss": 0.85398132, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1070, + "time_per_iteration": 2.6194372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114316, + "balance_loss_mlp": 1.12903321, + "diversity_loss_mlp": 0.0, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08033414700046611, + "language_loss": 0.87081122, + "learning_rate": 0.0009209174516012091, + "loss": 0.88224292, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1071, + "time_per_iteration": 2.5169904232025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146914, + "balance_loss_mlp": 1.13247752, + "diversity_loss_mlp": 0.0, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.06769648970134874, + "language_loss": 0.89207751, + "learning_rate": 0.0009207492195798747, + "loss": 0.90354669, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1072, + "time_per_iteration": 2.804577112197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.12303698, + "diversity_loss_mlp": 0.0, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.0857236005827703, + "language_loss": 0.84780991, + "learning_rate": 0.0009205808242092061, + "loss": 0.85918474, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1073, + "time_per_iteration": 2.6134936809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122455, + "balance_loss_mlp": 1.10787559, + "diversity_loss_mlp": 0.0, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.09531084522047072, + "language_loss": 0.82512677, + "learning_rate": 0.0009204122655545808, + "loss": 0.83635134, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1074, + "time_per_iteration": 3.461315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888955, + "balance_loss_mlp": 1.54418314, + "diversity_loss_mlp": 0.20175909, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.03221822204199988, + "language_loss": 0.80952764, + "learning_rate": 0.0009202435436814388, + "loss": 0.81841719, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01598355, + "step": 1075, + "time_per_iteration": 2.728055238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146745, + "balance_loss_mlp": 1.13259482, + "diversity_loss_mlp": 0.0, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.0831097658087499, + "language_loss": 0.89925295, + "learning_rate": 0.0009200746586552836, + "loss": 0.91072041, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1076, + "time_per_iteration": 2.929422616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.12185347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.07960863169785164, + "language_loss": 0.84148425, + "learning_rate": 0.0009199056105416825, + "loss": 0.85284609, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1077, + "time_per_iteration": 3.0795576572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_mlp": 1.13384151, + "diversity_loss_mlp": 0.0, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.06589509494701294, + "language_loss": 0.86599898, + "learning_rate": 0.0009197363994062654, + "loss": 0.87747955, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1078, + "time_per_iteration": 2.8304550647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891417, + "balance_loss_mlp": 1.54815006, + "diversity_loss_mlp": 0.20151556, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.027729032115243194, + "language_loss": 0.84302026, + "learning_rate": 0.0009195670253147262, + "loss": 0.85193443, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01658459, + "step": 1079, + "time_per_iteration": 2.987715005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168872, + "balance_loss_mlp": 1.15472198, + "diversity_loss_mlp": 0.0, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07878432741989363, + "language_loss": 0.82508785, + "learning_rate": 0.0009193974883328216, + "loss": 0.83677661, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1080, + "time_per_iteration": 2.6007754802703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178335, + "balance_loss_mlp": 1.16408908, + "diversity_loss_mlp": 0.0, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06872318796781544, + "language_loss": 0.86871535, + "learning_rate": 0.0009192277885263718, + "loss": 0.88049871, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1081, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_mlp": 1.15339386, + "diversity_loss_mlp": 0.0, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.08475435362049728, + "language_loss": 0.86010319, + "learning_rate": 0.0009190579259612602, + "loss": 0.87178093, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1082, + "time_per_iteration": 3.2688331604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153529, + "balance_loss_mlp": 1.13914001, + "diversity_loss_mlp": 0.0, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.06676527060715894, + "language_loss": 0.86419082, + "learning_rate": 0.000918887900703433, + "loss": 0.8757261, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1083, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129996, + "balance_loss_mlp": 1.11559522, + "diversity_loss_mlp": 0.0, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07296749014166971, + "language_loss": 0.89779425, + "learning_rate": 0.0009187177128188999, + "loss": 0.90909421, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1084, + "time_per_iteration": 2.441312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_mlp": 1.11915255, + "diversity_loss_mlp": 0.0, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.053207927956046876, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78285372, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1085, + "time_per_iteration": 4.864179849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117368, + "balance_loss_mlp": 1.1029439, + "diversity_loss_mlp": 0.0, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07905606819783856, + "language_loss": 0.85833263, + "learning_rate": 0.000918376849434071, + "loss": 0.86950636, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1086, + "time_per_iteration": 4.049270868301392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112999, + "balance_loss_mlp": 1.09849179, + "diversity_loss_mlp": 0.0, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.08954509639668791, + "language_loss": 0.90778226, + "learning_rate": 0.0009182061740661098, + "loss": 0.91891223, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1087, + "time_per_iteration": 2.557358741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128974, + "balance_loss_mlp": 1.11446643, + "diversity_loss_mlp": 0.0, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08446380837501397, + "language_loss": 0.85054636, + "learning_rate": 0.0009180353363361127, + "loss": 0.86183608, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1088, + "time_per_iteration": 3.0897305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118573, + "balance_loss_mlp": 1.10417306, + "diversity_loss_mlp": 0.0, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.08173869768976531, + "language_loss": 0.82508695, + "learning_rate": 0.0009178643363104044, + "loss": 0.83627272, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1089, + "time_per_iteration": 3.124645948410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_mlp": 1.09938824, + "diversity_loss_mlp": 0.0, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.09307233053408402, + "language_loss": 0.90518665, + "learning_rate": 0.0009176931740553735, + "loss": 0.9163233, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1090, + "time_per_iteration": 2.6098225116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.09981966, + "diversity_loss_mlp": 0.0, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.09489388322063774, + "language_loss": 0.8240813, + "learning_rate": 0.0009175218496374708, + "loss": 0.83521861, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1091, + "time_per_iteration": 3.336355686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110612, + "balance_loss_mlp": 1.09205294, + "diversity_loss_mlp": 0.0, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.08870561470384966, + "language_loss": 0.86057436, + "learning_rate": 0.0009173503631232103, + "loss": 0.87163556, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1092, + "time_per_iteration": 3.356015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106884, + "balance_loss_mlp": 1.09269798, + "diversity_loss_mlp": 0.0, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09478788106803046, + "language_loss": 0.82067865, + "learning_rate": 0.0009171787145791691, + "loss": 0.83174753, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1093, + "time_per_iteration": 3.2546143531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116222, + "balance_loss_mlp": 1.10199988, + "diversity_loss_mlp": 0.0, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14674509624116924, + "language_loss": 0.80160701, + "learning_rate": 0.000917006904071987, + "loss": 0.81276917, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1094, + "time_per_iteration": 2.5837080478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911953, + "balance_loss_mlp": 1.58726883, + "diversity_loss_mlp": 0.20477253, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.035943125208157026, + "language_loss": 0.8737694, + "learning_rate": 0.0009168349316683669, + "loss": 0.88288891, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01593196, + "step": 1095, + "time_per_iteration": 2.768296718597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136825, + "balance_loss_mlp": 1.1224122, + "diversity_loss_mlp": 0.0, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06639171103878667, + "language_loss": 0.82719827, + "learning_rate": 0.0009166627974350741, + "loss": 0.83856648, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1096, + "time_per_iteration": 2.8819992542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145046, + "balance_loss_mlp": 1.13041949, + "diversity_loss_mlp": 0.0, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.08337696606413014, + "language_loss": 0.89929205, + "learning_rate": 0.0009164905014389373, + "loss": 0.91074252, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1097, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_mlp": 1.1495918, + "diversity_loss_mlp": 0.0, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08033808486911229, + "language_loss": 0.86386079, + "learning_rate": 0.0009163180437468476, + "loss": 0.87549889, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1098, + "time_per_iteration": 2.6314592361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176615, + "balance_loss_mlp": 1.16195273, + "diversity_loss_mlp": 0.0, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.09094665560265827, + "language_loss": 0.85629344, + "learning_rate": 0.000916145424425759, + "loss": 0.86805964, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1099, + "time_per_iteration": 2.6608541011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181873, + "balance_loss_mlp": 1.16744852, + "diversity_loss_mlp": 0.0, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.09944182260515583, + "language_loss": 0.9083795, + "learning_rate": 0.0009159726435426885, + "loss": 0.9201982, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1100, + "time_per_iteration": 3.0502405166625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.134619, + "diversity_loss_mlp": 0.0, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.09151162791452093, + "language_loss": 0.90900993, + "learning_rate": 0.0009157997011647154, + "loss": 0.92050231, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1101, + "time_per_iteration": 2.6048476696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127613, + "balance_loss_mlp": 1.11389172, + "diversity_loss_mlp": 0.0, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.07696729699318336, + "language_loss": 0.86130077, + "learning_rate": 0.0009156265973589817, + "loss": 0.87257689, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1102, + "time_per_iteration": 2.7552144527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114805, + "balance_loss_mlp": 1.10088181, + "diversity_loss_mlp": 0.0, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.07661877314329607, + "language_loss": 0.89485067, + "learning_rate": 0.0009154533321926926, + "loss": 0.90599877, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.13909912, + "routerloss_mlp": 0.0, + "step": 1103, + "time_per_iteration": 4.073851108551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_mlp": 1.09134197, + "diversity_loss_mlp": 0.0, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08363594534482698, + "language_loss": 0.8717171, + "learning_rate": 0.0009152799057331156, + "loss": 0.88276958, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1104, + "time_per_iteration": 3.142221450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_mlp": 1.08656633, + "diversity_loss_mlp": 0.0, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1056362594360365, + "language_loss": 0.91270363, + "learning_rate": 0.0009151063180475805, + "loss": 0.92370498, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1105, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095772, + "balance_loss_mlp": 1.08196795, + "diversity_loss_mlp": 0.0, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.08072473316090223, + "language_loss": 0.84285367, + "learning_rate": 0.0009149325692034803, + "loss": 0.85381138, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1106, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.06266928, + "diversity_loss_mlp": 0.0, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.04229613635199888, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.8027482, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1107, + "time_per_iteration": 4.817704916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129097, + "balance_loss_mlp": 1.11547112, + "diversity_loss_mlp": 0.0, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.07382538641756346, + "language_loss": 0.8748607, + "learning_rate": 0.0009145845883094678, + "loss": 0.88615161, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1108, + "time_per_iteration": 3.039318561553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150208, + "balance_loss_mlp": 1.13671303, + "diversity_loss_mlp": 0.0, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07887220377556703, + "language_loss": 0.85174125, + "learning_rate": 0.000914410356394654, + "loss": 0.86324334, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1109, + "time_per_iteration": 2.76413893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_mlp": 1.1484766, + "diversity_loss_mlp": 0.0, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.06362602917472766, + "language_loss": 0.84447891, + "learning_rate": 0.0009142359635914709, + "loss": 0.85609984, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1110, + "time_per_iteration": 3.007201671600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163563, + "balance_loss_mlp": 1.15004468, + "diversity_loss_mlp": 0.0, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07633144605420673, + "language_loss": 0.84598219, + "learning_rate": 0.0009140614099676245, + "loss": 0.85761786, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1111, + "time_per_iteration": 2.569401979446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161722, + "balance_loss_mlp": 1.14807272, + "diversity_loss_mlp": 0.0, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.0712977258009472, + "language_loss": 0.82590818, + "learning_rate": 0.0009138866955908821, + "loss": 0.83752549, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1112, + "time_per_iteration": 2.870701789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_mlp": 1.15294182, + "diversity_loss_mlp": 0.0, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.09239605609063735, + "language_loss": 0.80485952, + "learning_rate": 0.0009137118205290738, + "loss": 0.81652606, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.13739014, + "routerloss_mlp": 0.0, + "step": 1113, + "time_per_iteration": 2.9623591899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174843, + "balance_loss_mlp": 1.16082442, + "diversity_loss_mlp": 0.0, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08763873550503462, + "language_loss": 0.90553653, + "learning_rate": 0.0009135367848500924, + "loss": 0.91728497, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1114, + "time_per_iteration": 2.5287492275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.15138936, + "diversity_loss_mlp": 0.0, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.11593363319598911, + "language_loss": 0.86361086, + "learning_rate": 0.0009133615886218927, + "loss": 0.87526232, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1115, + "time_per_iteration": 2.6945505142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141616, + "balance_loss_mlp": 1.12725139, + "diversity_loss_mlp": 0.0, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08371979294567897, + "language_loss": 0.87389791, + "learning_rate": 0.0009131862319124917, + "loss": 0.88531411, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1116, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130003, + "balance_loss_mlp": 1.1162107, + "diversity_loss_mlp": 0.0, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08272793517794225, + "language_loss": 0.83981287, + "learning_rate": 0.0009130107147899691, + "loss": 0.85111284, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1117, + "time_per_iteration": 2.698151111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.1039083, + "diversity_loss_mlp": 0.0, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.4685945915436946, + "language_loss": 0.85086691, + "learning_rate": 0.0009128350373224665, + "loss": 0.86204791, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1118, + "time_per_iteration": 2.545565128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059182, + "balance_loss_mlp": 1.04950213, + "diversity_loss_mlp": 0.0, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.03761711697708654, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82515609, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1119, + "time_per_iteration": 4.648902416229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118843, + "balance_loss_mlp": 1.10412121, + "diversity_loss_mlp": 0.0, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07492511871579786, + "language_loss": 0.85205054, + "learning_rate": 0.0009124832016254005, + "loss": 0.86323893, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1120, + "time_per_iteration": 2.5875513553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112958, + "balance_loss_mlp": 1.11404657, + "diversity_loss_mlp": 0.0, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.10623123993924175, + "language_loss": 0.88117284, + "learning_rate": 0.0009123070435324316, + "loss": 0.89246857, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1121, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119114, + "balance_loss_mlp": 1.10852826, + "diversity_loss_mlp": 0.0, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.05861429426141409, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78994894, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 1122, + "time_per_iteration": 4.993450880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114894, + "balance_loss_mlp": 1.13229823, + "diversity_loss_mlp": 0.0, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.09758120262844092, + "language_loss": 0.86477894, + "learning_rate": 0.0009119542471995752, + "loss": 0.87626839, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 1123, + "time_per_iteration": 2.8260560035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132116, + "balance_loss_mlp": 1.1160109, + "diversity_loss_mlp": 0.0, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.1175490331770948, + "language_loss": 0.81597894, + "learning_rate": 0.0009117776090966554, + "loss": 0.82730007, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1124, + "time_per_iteration": 2.955768585205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133281, + "balance_loss_mlp": 1.1166153, + "diversity_loss_mlp": 0.0, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08908783615486303, + "language_loss": 0.86717665, + "learning_rate": 0.0009116008111274899, + "loss": 0.87850952, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1125, + "time_per_iteration": 3.2493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_mlp": 1.02921367, + "diversity_loss_mlp": 0.0, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.03267712428803131, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80145574, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 1126, + "time_per_iteration": 4.8121678829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_mlp": 1.13257909, + "diversity_loss_mlp": 0.0, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.09699177011816186, + "language_loss": 0.85244691, + "learning_rate": 0.0009112467358650396, + "loss": 0.86393118, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1127, + "time_per_iteration": 3.144075393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_mlp": 1.15056634, + "diversity_loss_mlp": 0.0, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.07985175184807933, + "language_loss": 0.86319685, + "learning_rate": 0.0009110694587092192, + "loss": 0.87486213, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.1595459, + "routerloss_mlp": 0.0, + "step": 1128, + "time_per_iteration": 2.7497644424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179675, + "balance_loss_mlp": 1.1634866, + "diversity_loss_mlp": 0.0, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.1038215552752292, + "language_loss": 0.81267089, + "learning_rate": 0.0009108920219620815, + "loss": 0.82446766, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 1129, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195026, + "balance_loss_mlp": 1.1788609, + "diversity_loss_mlp": 0.0, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06771714561059723, + "language_loss": 0.89286679, + "learning_rate": 0.0009107144256925133, + "loss": 0.9048171, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1130, + "time_per_iteration": 2.6569926738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196317, + "balance_loss_mlp": 1.18006873, + "diversity_loss_mlp": 0.0, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08333124164895586, + "language_loss": 0.82520813, + "learning_rate": 0.0009105366699694638, + "loss": 0.83717132, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 1131, + "time_per_iteration": 2.7384698390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200769, + "balance_loss_mlp": 1.18390059, + "diversity_loss_mlp": 0.0, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.07018840625680964, + "language_loss": 0.81826723, + "learning_rate": 0.0009103587548619439, + "loss": 0.83027488, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 1132, + "time_per_iteration": 2.8361291885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188026, + "balance_loss_mlp": 1.17064476, + "diversity_loss_mlp": 0.0, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.08238158624987729, + "language_loss": 0.85952497, + "learning_rate": 0.0009101806804390261, + "loss": 0.87140524, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.1739502, + "routerloss_mlp": 0.0, + "step": 1133, + "time_per_iteration": 2.8646528720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846565, + "balance_loss_mlp": 1.45559311, + "diversity_loss_mlp": 0.20202307, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.03511986753794681, + "language_loss": 0.90682399, + "learning_rate": 0.0009100024467698453, + "loss": 0.91528964, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01775702, + "step": 1134, + "time_per_iteration": 2.628955364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119036, + "balance_loss_mlp": 1.17289567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.09831196896097749, + "language_loss": 0.82889581, + "learning_rate": 0.0009098240539235981, + "loss": 0.84079945, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 1135, + "time_per_iteration": 2.6857638359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_mlp": 1.16191649, + "diversity_loss_mlp": 0.0, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07855046788509763, + "language_loss": 0.87649047, + "learning_rate": 0.0009096455019695423, + "loss": 0.88828909, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 1136, + "time_per_iteration": 2.814746856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_mlp": 1.15702188, + "diversity_loss_mlp": 0.0, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.090535881946018, + "language_loss": 0.89789271, + "learning_rate": 0.000909466790976998, + "loss": 0.90964472, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18188477, + "routerloss_mlp": 0.0, + "step": 1137, + "time_per_iteration": 2.503934144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13231349, + "diversity_loss_mlp": 0.0, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07386356915969775, + "language_loss": 0.82546908, + "learning_rate": 0.0009092879210153473, + "loss": 0.83698207, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18981934, + "routerloss_mlp": 0.0, + "step": 1138, + "time_per_iteration": 3.106015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143167, + "balance_loss_mlp": 1.12445128, + "diversity_loss_mlp": 0.0, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08443059177839436, + "language_loss": 0.89126158, + "learning_rate": 0.0009091088921540333, + "loss": 0.90269327, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.18701172, + "routerloss_mlp": 0.0, + "step": 1139, + "time_per_iteration": 2.5165584087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197317, + "balance_loss_mlp": 1.18491888, + "diversity_loss_mlp": 0.0, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.06938907882855633, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76705992, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 1140, + "time_per_iteration": 4.907839775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845315, + "balance_loss_mlp": 1.45913088, + "diversity_loss_mlp": 0.19676474, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.04157801253712285, + "language_loss": 0.84799111, + "learning_rate": 0.0009087503580104985, + "loss": 0.8564443, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01736734, + "step": 1141, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106137, + "balance_loss_mlp": 1.08643126, + "diversity_loss_mlp": 0.0, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09652849342648293, + "language_loss": 0.7964108, + "learning_rate": 0.0009085708528674728, + "loss": 0.80747211, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 1142, + "time_per_iteration": 2.7800490856170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.09476519, + "diversity_loss_mlp": 0.0, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.11345906914127299, + "language_loss": 0.8700006, + "learning_rate": 0.0009083911891031745, + "loss": 0.88115132, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 1143, + "time_per_iteration": 3.104893684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_mlp": 1.08533978, + "diversity_loss_mlp": 0.0, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.12428556161586228, + "language_loss": 0.91569418, + "learning_rate": 0.0009082113667873553, + "loss": 0.92673439, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18676758, + "routerloss_mlp": 0.0, + "step": 1144, + "time_per_iteration": 3.0838277339935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_mlp": 1.12060392, + "diversity_loss_mlp": 0.0, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.0955721440223133, + "language_loss": 0.90911627, + "learning_rate": 0.0009080313859898283, + "loss": 0.92050546, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 1145, + "time_per_iteration": 2.4998109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.14463091, + "diversity_loss_mlp": 0.0, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07871728913387968, + "language_loss": 0.91642439, + "learning_rate": 0.0009078512467804684, + "loss": 0.92804986, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.17932129, + "routerloss_mlp": 0.0, + "step": 1146, + "time_per_iteration": 2.583137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192448, + "balance_loss_mlp": 1.17516243, + "diversity_loss_mlp": 0.0, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.10815580627735921, + "language_loss": 0.90245295, + "learning_rate": 0.0009076709492292119, + "loss": 0.91437739, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 1147, + "time_per_iteration": 2.6189510822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199389, + "balance_loss_mlp": 1.18260384, + "diversity_loss_mlp": 0.0, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.10018226205073696, + "language_loss": 0.88948917, + "learning_rate": 0.0009074904934060562, + "loss": 0.90148306, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1148, + "time_per_iteration": 2.6619913578033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.17623389, + "diversity_loss_mlp": 0.0, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.09879445691718633, + "language_loss": 0.85041308, + "learning_rate": 0.0009073098793810607, + "loss": 0.8623414, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.1661377, + "routerloss_mlp": 0.0, + "step": 1149, + "time_per_iteration": 2.9382119178771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185083, + "balance_loss_mlp": 1.16848898, + "diversity_loss_mlp": 0.0, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.09716543961816822, + "language_loss": 0.88557786, + "learning_rate": 0.000907129107224346, + "loss": 0.89742863, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.16601562, + "routerloss_mlp": 0.0, + "step": 1150, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.17356002, + "diversity_loss_mlp": 0.0, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.0741661773141201, + "language_loss": 0.88313866, + "learning_rate": 0.0009069481770060939, + "loss": 0.89504004, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1151, + "time_per_iteration": 2.676938056945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118655, + "balance_loss_mlp": 1.17039752, + "diversity_loss_mlp": 0.0, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06827936796637825, + "language_loss": 0.83848286, + "learning_rate": 0.000906767088796548, + "loss": 0.85034835, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1152, + "time_per_iteration": 3.442782163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185002, + "balance_loss_mlp": 1.16889715, + "diversity_loss_mlp": 0.0, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.07358747282835834, + "language_loss": 0.87001419, + "learning_rate": 0.0009065858426660127, + "loss": 0.88186425, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1153, + "time_per_iteration": 2.6501753330230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178927, + "balance_loss_mlp": 1.16286922, + "diversity_loss_mlp": 0.0, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.0863709920952229, + "language_loss": 0.84764236, + "learning_rate": 0.0009064044386848543, + "loss": 0.85943162, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1154, + "time_per_iteration": 2.920689344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176891, + "balance_loss_mlp": 1.16032064, + "diversity_loss_mlp": 0.0, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.07669791788600007, + "language_loss": 0.88829726, + "learning_rate": 0.0009062228769234997, + "loss": 0.90006614, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 1155, + "time_per_iteration": 2.561638832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154629, + "balance_loss_mlp": 1.13797593, + "diversity_loss_mlp": 0.0, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08447027490527963, + "language_loss": 0.81123281, + "learning_rate": 0.0009060411574524376, + "loss": 0.82277906, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 1156, + "time_per_iteration": 2.655132293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162354, + "balance_loss_mlp": 1.14597416, + "diversity_loss_mlp": 0.0, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.08665349089557017, + "language_loss": 0.87817705, + "learning_rate": 0.0009058592803422178, + "loss": 0.88980061, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 1157, + "time_per_iteration": 3.1417362689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_mlp": 1.17430186, + "diversity_loss_mlp": 0.0, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.06198684812147071, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79893315, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1158, + "time_per_iteration": 4.867843866348267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.1120069, + "diversity_loss_mlp": 0.0, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0864152607347894, + "language_loss": 0.90156865, + "learning_rate": 0.00090549505348681, + "loss": 0.91285539, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1159, + "time_per_iteration": 2.581865072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118982, + "balance_loss_mlp": 1.1025548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07056827667929483, + "language_loss": 0.83819324, + "learning_rate": 0.0009053127038830275, + "loss": 0.84938306, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 1160, + "time_per_iteration": 2.9969708919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881169, + "balance_loss_mlp": 1.53314447, + "diversity_loss_mlp": 0.19063006, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.04002382495760162, + "language_loss": 0.87460124, + "learning_rate": 0.000905130196922898, + "loss": 0.88341296, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01928164, + "step": 1161, + "time_per_iteration": 2.6307718753814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881407, + "balance_loss_mlp": 1.5316093, + "diversity_loss_mlp": 0.19140732, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.030280826501304762, + "language_loss": 0.86784196, + "learning_rate": 0.0009049475326772769, + "loss": 0.87665606, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01989887, + "step": 1162, + "time_per_iteration": 2.6021478176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00889034, + "balance_loss_mlp": 1.54766631, + "diversity_loss_mlp": 0.19066738, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.03198536270345376, + "language_loss": 0.83124602, + "learning_rate": 0.0009047647112170811, + "loss": 0.84013629, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01986698, + "step": 1163, + "time_per_iteration": 2.804150342941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123868, + "balance_loss_mlp": 1.1070838, + "diversity_loss_mlp": 0.0, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.09901141435665076, + "language_loss": 0.87948084, + "learning_rate": 0.0009045817326132876, + "loss": 0.89071947, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1164, + "time_per_iteration": 3.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125369, + "balance_loss_mlp": 1.107988, + "diversity_loss_mlp": 0.0, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08432013167879508, + "language_loss": 0.83142793, + "learning_rate": 0.0009043985969369357, + "loss": 0.84268159, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17407227, + "routerloss_mlp": 0.0, + "step": 1165, + "time_per_iteration": 2.8148193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146301, + "balance_loss_mlp": 1.12976706, + "diversity_loss_mlp": 0.0, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06944445596490195, + "language_loss": 0.84334069, + "learning_rate": 0.0009042153042591245, + "loss": 0.85480368, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 1166, + "time_per_iteration": 2.8004493713378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142176, + "balance_loss_mlp": 1.12542677, + "diversity_loss_mlp": 0.0, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.06821660135571728, + "language_loss": 0.85225487, + "learning_rate": 0.0009040318546510146, + "loss": 0.86367661, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 1167, + "time_per_iteration": 3.1969215869903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156354, + "balance_loss_mlp": 1.13979554, + "diversity_loss_mlp": 0.0, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.06547364647617461, + "language_loss": 0.84988701, + "learning_rate": 0.0009038482481838275, + "loss": 0.86145055, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 1168, + "time_per_iteration": 2.7087180614471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00861334, + "balance_loss_mlp": 1.49333596, + "diversity_loss_mlp": 0.19261675, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.02892951533663535, + "language_loss": 0.87266529, + "learning_rate": 0.0009036644849288455, + "loss": 0.88127863, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01835741, + "step": 1169, + "time_per_iteration": 3.1039352416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.1631248, + "diversity_loss_mlp": 0.0, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.06865085555084699, + "language_loss": 0.85404736, + "learning_rate": 0.0009034805649574118, + "loss": 0.86584634, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.16784668, + "routerloss_mlp": 0.0, + "step": 1170, + "time_per_iteration": 2.659322738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208955, + "balance_loss_mlp": 1.1926589, + "diversity_loss_mlp": 0.0, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.07685307661183591, + "language_loss": 0.85691977, + "learning_rate": 0.0009032964883409308, + "loss": 0.86900926, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 1171, + "time_per_iteration": 2.8938751220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128211, + "balance_loss_mlp": 1.11910319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.06058864885284362, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74178743, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 1172, + "time_per_iteration": 4.983820676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217918, + "balance_loss_mlp": 1.20207548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.1048847225020503, + "language_loss": 0.8717351, + "learning_rate": 0.0009029278654587462, + "loss": 0.88391435, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.1583252, + "routerloss_mlp": 0.0, + "step": 1173, + "time_per_iteration": 2.639632225036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_mlp": 1.16508245, + "diversity_loss_mlp": 0.0, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.07111002228073603, + "language_loss": 0.82226282, + "learning_rate": 0.0009027433193361548, + "loss": 0.83407944, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1174, + "time_per_iteration": 2.7443323135375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159983, + "balance_loss_mlp": 1.14366364, + "diversity_loss_mlp": 0.0, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06531304020653, + "language_loss": 0.86980343, + "learning_rate": 0.00090255861685474, + "loss": 0.88140327, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 1175, + "time_per_iteration": 2.7534220218658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142116, + "balance_loss_mlp": 1.12533128, + "diversity_loss_mlp": 0.0, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.10016618462748716, + "language_loss": 0.90750074, + "learning_rate": 0.0009023737580862095, + "loss": 0.91892195, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1176, + "time_per_iteration": 2.5116937160491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_mlp": 1.12470055, + "diversity_loss_mlp": 0.0, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0707285441494173, + "language_loss": 0.83225566, + "learning_rate": 0.0009021887431023321, + "loss": 0.84366333, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1177, + "time_per_iteration": 2.599956512451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130034, + "balance_loss_mlp": 1.11444104, + "diversity_loss_mlp": 0.0, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08431891612549362, + "language_loss": 0.87212515, + "learning_rate": 0.0009020035719749369, + "loss": 0.88342547, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1178, + "time_per_iteration": 2.7144312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135701, + "balance_loss_mlp": 1.1205014, + "diversity_loss_mlp": 0.0, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.09883499682369536, + "language_loss": 0.77450085, + "learning_rate": 0.0009018182447759136, + "loss": 0.7858578, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1179, + "time_per_iteration": 2.98848557472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137145, + "balance_loss_mlp": 1.12187457, + "diversity_loss_mlp": 0.0, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.08173095074239418, + "language_loss": 0.79878223, + "learning_rate": 0.0009016327615772126, + "loss": 0.81015366, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1180, + "time_per_iteration": 2.9338154792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149275, + "balance_loss_mlp": 1.13449335, + "diversity_loss_mlp": 0.0, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.08374692364956231, + "language_loss": 0.87680298, + "learning_rate": 0.0009014471224508451, + "loss": 0.88829577, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1181, + "time_per_iteration": 2.7131431102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881934, + "balance_loss_mlp": 1.53494334, + "diversity_loss_mlp": 0.19571492, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.04185105584005936, + "language_loss": 0.83154267, + "learning_rate": 0.0009012613274688823, + "loss": 0.84036207, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01660516, + "step": 1182, + "time_per_iteration": 2.649559736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184244, + "balance_loss_mlp": 1.1692239, + "diversity_loss_mlp": 0.0, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.12019924395271459, + "language_loss": 0.87753081, + "learning_rate": 0.0009010753767034565, + "loss": 0.8893733, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1183, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175003, + "balance_loss_mlp": 1.16030502, + "diversity_loss_mlp": 0.0, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.08783280174490297, + "language_loss": 0.78918862, + "learning_rate": 0.0009008892702267599, + "loss": 0.80093861, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1184, + "time_per_iteration": 2.9962406158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139115, + "balance_loss_mlp": 1.12460732, + "diversity_loss_mlp": 0.0, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.08254121322216867, + "language_loss": 0.88525105, + "learning_rate": 0.0009007030081110457, + "loss": 0.89664215, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1185, + "time_per_iteration": 2.5990660190582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_mlp": 1.11087465, + "diversity_loss_mlp": 0.0, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.07610459395316062, + "language_loss": 0.84548527, + "learning_rate": 0.000900516590428627, + "loss": 0.85674113, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1186, + "time_per_iteration": 2.7377407550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121666, + "balance_loss_mlp": 1.1070751, + "diversity_loss_mlp": 0.0, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.13748029932532174, + "language_loss": 0.89182103, + "learning_rate": 0.0009003300172518778, + "loss": 0.90303767, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1187, + "time_per_iteration": 2.6916556358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_mlp": 1.10145736, + "diversity_loss_mlp": 0.0, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.11313229810108143, + "language_loss": 0.84335989, + "learning_rate": 0.0009001432886532321, + "loss": 0.85452211, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1188, + "time_per_iteration": 2.9698264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.06729358528862889, + "language_loss": 0.86774516, + "learning_rate": 0.0008999564047051843, + "loss": 0.87889242, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1189, + "time_per_iteration": 2.5002098083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_mlp": 1.12243462, + "diversity_loss_mlp": 0.0, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0714274855120672, + "language_loss": 0.84824312, + "learning_rate": 0.0008997693654802894, + "loss": 0.85961115, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1190, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149047, + "balance_loss_mlp": 1.13425303, + "diversity_loss_mlp": 0.0, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.07754985979781381, + "language_loss": 0.86714745, + "learning_rate": 0.0008995821710511625, + "loss": 0.87863791, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1191, + "time_per_iteration": 2.7126989364624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14807296, + "diversity_loss_mlp": 0.0, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.11547698788472376, + "language_loss": 0.85060751, + "learning_rate": 0.0008993948214904786, + "loss": 0.86223602, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1192, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152534, + "balance_loss_mlp": 1.14361739, + "diversity_loss_mlp": 0.0, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.05307726892258072, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79574746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 1193, + "time_per_iteration": 4.909748792648315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187526, + "balance_loss_mlp": 1.17205215, + "diversity_loss_mlp": 0.0, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.09739164860103838, + "language_loss": 0.78353333, + "learning_rate": 0.0008990196572654427, + "loss": 0.79540861, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.15454102, + "routerloss_mlp": 0.0, + "step": 1194, + "time_per_iteration": 2.8592262268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117424, + "balance_loss_mlp": 1.1592319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.06260411033315277, + "language_loss": 0.87559408, + "learning_rate": 0.0008988318427467426, + "loss": 0.88733649, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1195, + "time_per_iteration": 2.7444722652435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00878316, + "balance_loss_mlp": 1.52780199, + "diversity_loss_mlp": 0.1948241, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.0364111048645648, + "language_loss": 0.86376345, + "learning_rate": 0.0008986438733877887, + "loss": 0.87254667, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01700337, + "step": 1196, + "time_per_iteration": 3.5090088844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137546, + "balance_loss_mlp": 1.1229074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.08413871186116019, + "language_loss": 0.83810687, + "learning_rate": 0.0008984557492615576, + "loss": 0.84948236, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1197, + "time_per_iteration": 2.9953744411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10803354, + "diversity_loss_mlp": 0.0, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.08617240411661099, + "language_loss": 0.90267789, + "learning_rate": 0.0008982674704410854, + "loss": 0.91390687, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1198, + "time_per_iteration": 2.7513339519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110338, + "balance_loss_mlp": 1.09598517, + "diversity_loss_mlp": 0.0, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.11146547076727734, + "language_loss": 0.77876621, + "learning_rate": 0.0008980790369994682, + "loss": 0.78986955, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1199, + "time_per_iteration": 2.989825487136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120977, + "balance_loss_mlp": 1.10670781, + "diversity_loss_mlp": 0.0, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.0677628031660983, + "language_loss": 0.8729977, + "learning_rate": 0.000897890449009863, + "loss": 0.88420743, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1200, + "time_per_iteration": 2.6784448623657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_mlp": 1.11330509, + "diversity_loss_mlp": 0.0, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.080414080555838, + "language_loss": 0.89825618, + "learning_rate": 0.0008977017065454853, + "loss": 0.90953267, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1201, + "time_per_iteration": 2.6610703468322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880483, + "balance_loss_mlp": 1.52539706, + "diversity_loss_mlp": 0.19880572, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.03277795962214655, + "language_loss": 0.80367738, + "learning_rate": 0.0008975128096796121, + "loss": 0.81248224, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01838172, + "step": 1202, + "time_per_iteration": 2.901998996734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145011, + "balance_loss_mlp": 1.13089633, + "diversity_loss_mlp": 0.0, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.10693947298766643, + "language_loss": 0.85848922, + "learning_rate": 0.0008973237584855794, + "loss": 0.86993933, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1203, + "time_per_iteration": 2.872408151626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160017, + "balance_loss_mlp": 1.1457237, + "diversity_loss_mlp": 0.0, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08753213296005687, + "language_loss": 0.82586002, + "learning_rate": 0.0008971345530367832, + "loss": 0.83746028, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1204, + "time_per_iteration": 2.4641921520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185717, + "balance_loss_mlp": 1.17120886, + "diversity_loss_mlp": 0.0, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07947534631123947, + "language_loss": 0.85658818, + "learning_rate": 0.0008969451934066799, + "loss": 0.8684454, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1205, + "time_per_iteration": 2.7822117805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173591, + "balance_loss_mlp": 1.15872586, + "diversity_loss_mlp": 0.0, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08780432716538046, + "language_loss": 0.79991889, + "learning_rate": 0.0008967556796687854, + "loss": 0.81165481, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1206, + "time_per_iteration": 2.8849406242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117013, + "balance_loss_mlp": 1.15584886, + "diversity_loss_mlp": 0.0, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.07569633120476413, + "language_loss": 0.83779937, + "learning_rate": 0.0008965660118966752, + "loss": 0.84950066, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1207, + "time_per_iteration": 2.9316329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146692, + "balance_loss_mlp": 1.1319102, + "diversity_loss_mlp": 0.0, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06968265941642382, + "language_loss": 0.90114093, + "learning_rate": 0.0008963761901639851, + "loss": 0.91260791, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1208, + "time_per_iteration": 2.8140323162078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113879, + "balance_loss_mlp": 1.12392485, + "diversity_loss_mlp": 0.0, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.08612535310277082, + "language_loss": 0.83098078, + "learning_rate": 0.0008961862145444103, + "loss": 0.84236872, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1209, + "time_per_iteration": 2.7529945373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_mlp": 1.10796285, + "diversity_loss_mlp": 0.0, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08243119711445285, + "language_loss": 0.85338795, + "learning_rate": 0.0008959960851117059, + "loss": 0.86461735, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1210, + "time_per_iteration": 2.624340534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108554, + "balance_loss_mlp": 1.09396267, + "diversity_loss_mlp": 0.0, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.10596241027535934, + "language_loss": 0.84048676, + "learning_rate": 0.0008958058019396868, + "loss": 0.85157233, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1211, + "time_per_iteration": 2.8316566944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112068, + "balance_loss_mlp": 1.09751284, + "diversity_loss_mlp": 0.0, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.07651667178885936, + "language_loss": 0.86494702, + "learning_rate": 0.0008956153651022274, + "loss": 0.8760677, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1212, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_mlp": 1.08926892, + "diversity_loss_mlp": 0.0, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07459915787800217, + "language_loss": 0.83929688, + "learning_rate": 0.0008954247746732618, + "loss": 0.85033321, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1213, + "time_per_iteration": 2.6184399127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117524, + "balance_loss_mlp": 1.10321903, + "diversity_loss_mlp": 0.0, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.08317009769115577, + "language_loss": 0.90604293, + "learning_rate": 0.0008952340307267837, + "loss": 0.91721821, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1214, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_mlp": 1.10553002, + "diversity_loss_mlp": 0.0, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.09601716623847659, + "language_loss": 0.83731341, + "learning_rate": 0.0008950431333368468, + "loss": 0.84850979, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1215, + "time_per_iteration": 2.6151199340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130858, + "balance_loss_mlp": 1.11676729, + "diversity_loss_mlp": 0.0, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.08049188450288745, + "language_loss": 0.84623635, + "learning_rate": 0.0008948520825775634, + "loss": 0.8575449, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1216, + "time_per_iteration": 3.645200490951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123063, + "balance_loss_mlp": 1.10880601, + "diversity_loss_mlp": 0.0, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.08038238822992319, + "language_loss": 0.83978343, + "learning_rate": 0.0008946608785231067, + "loss": 0.85101402, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1217, + "time_per_iteration": 2.871616840362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126329, + "balance_loss_mlp": 1.11263156, + "diversity_loss_mlp": 0.0, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07832391647543825, + "language_loss": 0.84442961, + "learning_rate": 0.0008944695212477084, + "loss": 0.85569292, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1218, + "time_per_iteration": 2.507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123493, + "balance_loss_mlp": 1.10867572, + "diversity_loss_mlp": 0.0, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.07420792055611987, + "language_loss": 0.86334574, + "learning_rate": 0.0008942780108256599, + "loss": 0.87458062, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1219, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.07657909053901747, + "language_loss": 0.86160946, + "learning_rate": 0.0008940863473313121, + "loss": 0.87268722, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1220, + "time_per_iteration": 2.495164632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_mlp": 1.09272623, + "diversity_loss_mlp": 0.0, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07962638616920462, + "language_loss": 0.87889743, + "learning_rate": 0.0008938945308390756, + "loss": 0.88997114, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1221, + "time_per_iteration": 2.613927125930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097552, + "balance_loss_mlp": 1.08298469, + "diversity_loss_mlp": 0.0, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.06679649396710063, + "language_loss": 0.87179595, + "learning_rate": 0.00089370256142342, + "loss": 0.88277149, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1222, + "time_per_iteration": 2.732208013534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094745, + "balance_loss_mlp": 1.07952189, + "diversity_loss_mlp": 0.0, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.06680688140454344, + "language_loss": 0.84810197, + "learning_rate": 0.0008935104391588746, + "loss": 0.85904944, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1223, + "time_per_iteration": 2.7585461139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.07917881, + "diversity_loss_mlp": 0.0, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.07271030004651308, + "language_loss": 0.83111542, + "learning_rate": 0.0008933181641200276, + "loss": 0.84206444, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.15710449, + "routerloss_mlp": 0.0, + "step": 1224, + "time_per_iteration": 3.1440725326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087445, + "balance_loss_mlp": 1.07139981, + "diversity_loss_mlp": 0.0, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.07882513603721358, + "language_loss": 0.85824931, + "learning_rate": 0.0008931257363815271, + "loss": 0.8691237, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.16040039, + "routerloss_mlp": 0.0, + "step": 1225, + "time_per_iteration": 2.8887243270874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092595, + "balance_loss_mlp": 1.07659674, + "diversity_loss_mlp": 0.0, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.09571789824401095, + "language_loss": 0.89901638, + "learning_rate": 0.0008929331560180798, + "loss": 0.90994227, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.15991211, + "routerloss_mlp": 0.0, + "step": 1226, + "time_per_iteration": 2.897155284881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095366, + "balance_loss_mlp": 1.07965469, + "diversity_loss_mlp": 0.0, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.068724406385502, + "language_loss": 0.90771782, + "learning_rate": 0.0008927404231044525, + "loss": 0.91867149, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 1227, + "time_per_iteration": 2.6892144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.08764625, + "diversity_loss_mlp": 0.0, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.06943954848997126, + "language_loss": 0.81646705, + "learning_rate": 0.0008925475377154703, + "loss": 0.82749879, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1228, + "time_per_iteration": 2.727325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.11394727, + "diversity_loss_mlp": 0.0, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.0778889683705481, + "language_loss": 0.8212285, + "learning_rate": 0.0008923544999260183, + "loss": 0.83252132, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1229, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.13194346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.0853653064859127, + "language_loss": 0.91254115, + "learning_rate": 0.00089216130981104, + "loss": 0.92400861, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1230, + "time_per_iteration": 3.016228199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138894, + "balance_loss_mlp": 1.12364721, + "diversity_loss_mlp": 0.0, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.08048994442870243, + "language_loss": 0.82752085, + "learning_rate": 0.000891967967445539, + "loss": 0.83890975, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.15222168, + "routerloss_mlp": 0.0, + "step": 1231, + "time_per_iteration": 2.65736722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126061, + "balance_loss_mlp": 1.11135054, + "diversity_loss_mlp": 0.0, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.05909715635047166, + "language_loss": 0.889099, + "learning_rate": 0.0008917744729045772, + "loss": 0.90035963, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1232, + "time_per_iteration": 2.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110871, + "balance_loss_mlp": 1.0962795, + "diversity_loss_mlp": 0.0, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08046733758331526, + "language_loss": 0.83836448, + "learning_rate": 0.0008915808262632757, + "loss": 0.84947324, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1233, + "time_per_iteration": 2.860353708267212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918962, + "balance_loss_mlp": 1.60287488, + "diversity_loss_mlp": 0.20008399, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.03182006079144566, + "language_loss": 0.93544835, + "learning_rate": 0.0008913870275968148, + "loss": 0.94463801, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017482, + "step": 1234, + "time_per_iteration": 2.7328829765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095751, + "balance_loss_mlp": 1.08008718, + "diversity_loss_mlp": 0.0, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.07195832826776788, + "language_loss": 0.87503707, + "learning_rate": 0.0008911930769804342, + "loss": 0.88599461, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1235, + "time_per_iteration": 3.2619638442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091405, + "balance_loss_mlp": 1.07551408, + "diversity_loss_mlp": 0.0, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.07148547933088874, + "language_loss": 0.91313815, + "learning_rate": 0.0008909989744894318, + "loss": 0.92405218, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1236, + "time_per_iteration": 2.8687992095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06530952, + "diversity_loss_mlp": 0.0, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.08021447901266163, + "language_loss": 0.81662518, + "learning_rate": 0.0008908047201991649, + "loss": 0.8274349, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1237, + "time_per_iteration": 2.737638235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076608, + "balance_loss_mlp": 1.06138515, + "diversity_loss_mlp": 0.0, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.07749899394714953, + "language_loss": 0.86585152, + "learning_rate": 0.0008906103141850502, + "loss": 0.87661767, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.15197754, + "routerloss_mlp": 0.0, + "step": 1238, + "time_per_iteration": 2.9184746742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068848, + "balance_loss_mlp": 1.05385113, + "diversity_loss_mlp": 0.0, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.10230617436374452, + "language_loss": 0.88104367, + "learning_rate": 0.0008904157565225621, + "loss": 0.89173216, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.1496582, + "routerloss_mlp": 0.0, + "step": 1239, + "time_per_iteration": 2.6396749019622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_mlp": 1.06220865, + "diversity_loss_mlp": 0.0, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.10467557893696883, + "language_loss": 0.81824136, + "learning_rate": 0.000890221047287235, + "loss": 0.82901168, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1240, + "time_per_iteration": 3.496812582015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081371, + "balance_loss_mlp": 1.06710172, + "diversity_loss_mlp": 0.0, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.09443583580909311, + "language_loss": 0.91125917, + "learning_rate": 0.0008900261865546615, + "loss": 0.92207289, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1241, + "time_per_iteration": 2.6527724266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_mlp": 1.0890398, + "diversity_loss_mlp": 0.0, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.08429957072104315, + "language_loss": 0.84985352, + "learning_rate": 0.0008898311744004936, + "loss": 0.86089325, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1242, + "time_per_iteration": 2.6740338802337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118763, + "balance_loss_mlp": 1.10411179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.07332762129893158, + "language_loss": 0.86932802, + "learning_rate": 0.0008896360109004414, + "loss": 0.88051569, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1243, + "time_per_iteration": 2.643489122390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12715125, + "diversity_loss_mlp": 0.0, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09306092844590973, + "language_loss": 0.84636557, + "learning_rate": 0.0008894406961302742, + "loss": 0.85778666, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1244, + "time_per_iteration": 2.5876173973083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150798, + "balance_loss_mlp": 1.13590896, + "diversity_loss_mlp": 0.0, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.0838589606869783, + "language_loss": 0.83944738, + "learning_rate": 0.0008892452301658201, + "loss": 0.85095537, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1245, + "time_per_iteration": 2.928391218185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116146, + "balance_loss_mlp": 1.1460346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.0736247551351698, + "language_loss": 0.83299339, + "learning_rate": 0.0008890496130829653, + "loss": 0.84460801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1246, + "time_per_iteration": 2.6510462760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915571, + "balance_loss_mlp": 1.59993446, + "diversity_loss_mlp": 0.1987851, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.03287481157446996, + "language_loss": 0.85918486, + "learning_rate": 0.0008888538449576555, + "loss": 0.86834061, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621127, + "step": 1247, + "time_per_iteration": 2.5719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178279, + "balance_loss_mlp": 1.16323447, + "diversity_loss_mlp": 0.0, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10811715250715398, + "language_loss": 0.83036304, + "learning_rate": 0.0008886579258658944, + "loss": 0.8421458, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.15014648, + "routerloss_mlp": 0.0, + "step": 1248, + "time_per_iteration": 2.5736701488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148631, + "balance_loss_mlp": 1.13341999, + "diversity_loss_mlp": 0.0, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.07868761607649298, + "language_loss": 0.84717274, + "learning_rate": 0.0008884618558837446, + "loss": 0.85865903, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1249, + "time_per_iteration": 2.8215761184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911764, + "balance_loss_mlp": 1.59372783, + "diversity_loss_mlp": 0.19720009, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.03236174678929329, + "language_loss": 0.8677094, + "learning_rate": 0.0008882656350873273, + "loss": 0.87682706, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01629994, + "step": 1250, + "time_per_iteration": 2.885092258453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126022, + "balance_loss_mlp": 1.11122799, + "diversity_loss_mlp": 0.0, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.08347743908005935, + "language_loss": 0.87000573, + "learning_rate": 0.0008880692635528219, + "loss": 0.88126594, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1251, + "time_per_iteration": 3.049070119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106629, + "balance_loss_mlp": 1.09177542, + "diversity_loss_mlp": 0.0, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.07406446185181008, + "language_loss": 0.89514965, + "learning_rate": 0.0008878727413564669, + "loss": 0.90621597, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1252, + "time_per_iteration": 2.734839677810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.06804204, + "diversity_loss_mlp": 0.0, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.048930323133030355, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81211317, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1253, + "time_per_iteration": 4.854974031448364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00873083, + "balance_loss_mlp": 1.51531768, + "diversity_loss_mlp": 0.19563958, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.03648198852202315, + "language_loss": 0.78763413, + "learning_rate": 0.0008874792452834528, + "loss": 0.7963649, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01760404, + "step": 1254, + "time_per_iteration": 2.803690195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_mlp": 1.07530415, + "diversity_loss_mlp": 0.0, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.09659900556863026, + "language_loss": 0.8729195, + "learning_rate": 0.0008872822715595626, + "loss": 0.88381982, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1255, + "time_per_iteration": 2.657867670059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_mlp": 1.06968451, + "diversity_loss_mlp": 0.0, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10497791491954662, + "language_loss": 0.87333822, + "learning_rate": 0.0008870851474793598, + "loss": 0.88418031, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1256, + "time_per_iteration": 2.5694568157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_mlp": 1.06920075, + "diversity_loss_mlp": 0.0, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.07331256259210016, + "language_loss": 0.89243567, + "learning_rate": 0.0008868878731193752, + "loss": 0.90327322, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1257, + "time_per_iteration": 2.829789400100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086138, + "balance_loss_mlp": 1.07158267, + "diversity_loss_mlp": 0.0, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.07236027639177293, + "language_loss": 0.89720446, + "learning_rate": 0.0008866904485561973, + "loss": 0.90806586, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1258, + "time_per_iteration": 2.731635570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078524, + "balance_loss_mlp": 1.06384969, + "diversity_loss_mlp": 0.0, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.0727569881861308, + "language_loss": 0.83084273, + "learning_rate": 0.000886492873866473, + "loss": 0.84162796, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1259, + "time_per_iteration": 2.8250575065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_mlp": 1.06528533, + "diversity_loss_mlp": 0.0, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.10762424055834904, + "language_loss": 0.84672934, + "learning_rate": 0.000886295149126908, + "loss": 0.85753107, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1260, + "time_per_iteration": 2.7148356437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086434, + "balance_loss_mlp": 1.07181931, + "diversity_loss_mlp": 0.0, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.07159531524201106, + "language_loss": 0.85693741, + "learning_rate": 0.0008860972744142655, + "loss": 0.86780179, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1261, + "time_per_iteration": 2.931696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115009, + "balance_loss_mlp": 1.10064411, + "diversity_loss_mlp": 0.0, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.065367920687613, + "language_loss": 0.81639904, + "learning_rate": 0.0008858992498053671, + "loss": 0.82754916, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1262, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_mlp": 1.04764521, + "diversity_loss_mlp": 0.0, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.03374572714932058, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77644455, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1263, + "time_per_iteration": 4.882519006729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872344, + "balance_loss_mlp": 1.51226497, + "diversity_loss_mlp": 0.19974959, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.03166105856965055, + "language_loss": 0.83409035, + "learning_rate": 0.0008855027512063817, + "loss": 0.84281385, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633644, + "step": 1264, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185798, + "balance_loss_mlp": 1.17132628, + "diversity_loss_mlp": 0.0, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.06261248257395001, + "language_loss": 0.85949916, + "learning_rate": 0.0008853042773702292, + "loss": 0.8713572, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1265, + "time_per_iteration": 2.695514440536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196886, + "balance_loss_mlp": 1.18234205, + "diversity_loss_mlp": 0.0, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.08760826562773598, + "language_loss": 0.87981403, + "learning_rate": 0.0008851056539456896, + "loss": 0.89178288, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1266, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119913, + "balance_loss_mlp": 1.18489647, + "diversity_loss_mlp": 0.0, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.07991839198753149, + "language_loss": 0.81904382, + "learning_rate": 0.0008849068810098755, + "loss": 0.83103514, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1267, + "time_per_iteration": 3.3067915439605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174372, + "balance_loss_mlp": 1.15992332, + "diversity_loss_mlp": 0.0, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.10499473220259715, + "language_loss": 0.83550054, + "learning_rate": 0.0008847079586399575, + "loss": 0.84724426, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1268, + "time_per_iteration": 2.4791157245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.13699341, + "diversity_loss_mlp": 0.0, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07765469411987547, + "language_loss": 0.86144567, + "learning_rate": 0.0008845088869131641, + "loss": 0.87295628, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1269, + "time_per_iteration": 2.6733555793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_mlp": 1.10053682, + "diversity_loss_mlp": 0.0, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.0888033537849515, + "language_loss": 0.88898385, + "learning_rate": 0.0008843096659067818, + "loss": 0.90013218, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1270, + "time_per_iteration": 2.6315910816192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111213, + "balance_loss_mlp": 1.09708679, + "diversity_loss_mlp": 0.0, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.09475560383246978, + "language_loss": 0.86565858, + "learning_rate": 0.000884110295698155, + "loss": 0.87677073, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1271, + "time_per_iteration": 2.926668643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.08752966, + "diversity_loss_mlp": 0.0, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.09917556522455147, + "language_loss": 0.85849231, + "learning_rate": 0.0008839107763646861, + "loss": 0.86951411, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1272, + "time_per_iteration": 2.58022403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_mlp": 1.08751881, + "diversity_loss_mlp": 0.0, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.08783320449451974, + "language_loss": 0.89941388, + "learning_rate": 0.0008837111079838353, + "loss": 0.91043806, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1273, + "time_per_iteration": 2.6877150535583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111566, + "balance_loss_mlp": 1.10096157, + "diversity_loss_mlp": 0.0, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.07640958054403056, + "language_loss": 0.89671296, + "learning_rate": 0.000883511290633121, + "loss": 0.90786958, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1274, + "time_per_iteration": 2.5929813385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123812, + "balance_loss_mlp": 1.10898256, + "diversity_loss_mlp": 0.0, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.05814589763763208, + "language_loss": 0.92211604, + "learning_rate": 0.000883311324390119, + "loss": 0.93335414, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1275, + "time_per_iteration": 2.721343517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138447, + "balance_loss_mlp": 1.12315261, + "diversity_loss_mlp": 0.0, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.10098653640048322, + "language_loss": 0.81237984, + "learning_rate": 0.0008831112093324629, + "loss": 0.82376432, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.15283203, + "routerloss_mlp": 0.0, + "step": 1276, + "time_per_iteration": 3.066657543182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148152, + "balance_loss_mlp": 1.13266695, + "diversity_loss_mlp": 0.0, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07328274291062464, + "language_loss": 0.89255905, + "learning_rate": 0.0008829109455378444, + "loss": 0.90404058, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 1277, + "time_per_iteration": 2.6705071926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163573, + "balance_loss_mlp": 1.14844561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08343231090098181, + "language_loss": 0.86569774, + "learning_rate": 0.000882710533084013, + "loss": 0.87733346, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1278, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.13783133, + "diversity_loss_mlp": 0.0, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0729065811951457, + "language_loss": 0.8929435, + "learning_rate": 0.0008825099720487755, + "loss": 0.90446699, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1279, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00676302, + "balance_loss_mlp": 1.12665224, + "diversity_loss_mlp": 0.19835761, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.0027483074809680533, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.75937444, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0137972, + "step": 1280, + "time_per_iteration": 4.88429594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_mlp": 1.10232449, + "diversity_loss_mlp": 0.0, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.05615046205501133, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79055113, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1281, + "time_per_iteration": 4.752316236495972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113823, + "balance_loss_mlp": 1.09987593, + "diversity_loss_mlp": 0.0, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.08093958913819582, + "language_loss": 0.89542687, + "learning_rate": 0.0008819073982335619, + "loss": 0.90656507, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1282, + "time_per_iteration": 2.876927137374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_mlp": 1.08783603, + "diversity_loss_mlp": 0.0, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07169123109412263, + "language_loss": 0.84362143, + "learning_rate": 0.0008817062436519235, + "loss": 0.8546381, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.13824463, + "routerloss_mlp": 0.0, + "step": 1283, + "time_per_iteration": 2.6551387310028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086846, + "balance_loss_mlp": 1.5022366, + "diversity_loss_mlp": 0.20048198, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.033180516132009126, + "language_loss": 0.89655471, + "learning_rate": 0.0008815049408787788, + "loss": 0.90523928, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01710081, + "step": 1284, + "time_per_iteration": 2.5652830600738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_mlp": 1.08698821, + "diversity_loss_mlp": 0.0, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.0762028673981185, + "language_loss": 0.85473216, + "learning_rate": 0.0008813034899922805, + "loss": 0.86573577, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1285, + "time_per_iteration": 2.549622058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111306, + "balance_loss_mlp": 1.09783578, + "diversity_loss_mlp": 0.0, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.11471388318643767, + "language_loss": 0.89855313, + "learning_rate": 0.0008811018910706387, + "loss": 0.9096663, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1286, + "time_per_iteration": 2.575176954269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_mlp": 1.10453439, + "diversity_loss_mlp": 0.0, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.10517914532856759, + "language_loss": 0.81922066, + "learning_rate": 0.0008809001441921211, + "loss": 0.83040059, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1287, + "time_per_iteration": 2.732236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.1132865, + "diversity_loss_mlp": 0.0, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.1440229573277689, + "language_loss": 0.85392761, + "learning_rate": 0.0008806982494350528, + "loss": 0.86519527, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1288, + "time_per_iteration": 2.6544177532196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168468, + "balance_loss_mlp": 1.1549263, + "diversity_loss_mlp": 0.0, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07192560701016996, + "language_loss": 0.9021467, + "learning_rate": 0.0008804962068778161, + "loss": 0.91383135, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1289, + "time_per_iteration": 2.8321304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_mlp": 1.20329499, + "diversity_loss_mlp": 0.0, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.08274381184261048, + "language_loss": 0.81234664, + "learning_rate": 0.0008802940165988511, + "loss": 0.82451665, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1290, + "time_per_iteration": 2.848726749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262968, + "balance_loss_mlp": 1.24875808, + "diversity_loss_mlp": 0.0, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.09449787402071168, + "language_loss": 0.88461435, + "learning_rate": 0.000880091678676655, + "loss": 0.8972441, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1291, + "time_per_iteration": 2.802199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279654, + "balance_loss_mlp": 1.26553965, + "diversity_loss_mlp": 0.0, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.11843407890200246, + "language_loss": 0.88870949, + "learning_rate": 0.0008798891931897821, + "loss": 0.90150601, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1292, + "time_per_iteration": 2.7150259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870403, + "balance_loss_mlp": 1.50883341, + "diversity_loss_mlp": 0.20002533, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.035309457370921726, + "language_loss": 0.84031773, + "learning_rate": 0.0008796865602168447, + "loss": 0.84902173, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597392, + "step": 1293, + "time_per_iteration": 2.5952000617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210957, + "balance_loss_mlp": 1.19661582, + "diversity_loss_mlp": 0.0, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.07909897749306223, + "language_loss": 0.88611919, + "learning_rate": 0.0008794837798365115, + "loss": 0.89822876, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1294, + "time_per_iteration": 2.6257524490356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.15246725, + "diversity_loss_mlp": 0.0, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.06704316740686254, + "language_loss": 0.8866623, + "learning_rate": 0.0008792808521275089, + "loss": 0.89834166, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1295, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_mlp": 1.13757372, + "diversity_loss_mlp": 0.0, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.08601952378824393, + "language_loss": 0.87496305, + "learning_rate": 0.0008790777771686206, + "loss": 0.88649786, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1296, + "time_per_iteration": 2.6131319999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_mlp": 1.10882747, + "diversity_loss_mlp": 0.0, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.0951042007575699, + "language_loss": 0.8543523, + "learning_rate": 0.0008788745550386872, + "loss": 0.86559939, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 1297, + "time_per_iteration": 2.5590503215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115503, + "balance_loss_mlp": 1.09948111, + "diversity_loss_mlp": 0.0, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.07219065567928346, + "language_loss": 0.80291975, + "learning_rate": 0.0008786711858166063, + "loss": 0.81407487, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1298, + "time_per_iteration": 2.951768398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00871436, + "balance_loss_mlp": 1.51113367, + "diversity_loss_mlp": 0.19870289, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.03357842357877673, + "language_loss": 0.83488023, + "learning_rate": 0.0008784676695813332, + "loss": 0.84359455, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165179, + "step": 1299, + "time_per_iteration": 2.985684871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_mlp": 1.07411456, + "diversity_loss_mlp": 0.0, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07050099983107566, + "language_loss": 0.84900999, + "learning_rate": 0.0008782640064118796, + "loss": 0.85990846, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1300, + "time_per_iteration": 2.943368673324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139161, + "balance_loss_mlp": 1.13172245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.062054541004710057, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323914, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 1301, + "time_per_iteration": 4.975619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_mlp": 1.09055138, + "diversity_loss_mlp": 0.0, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.08145949094764637, + "language_loss": 0.86554521, + "learning_rate": 0.0008778562395867648, + "loss": 0.87660533, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1302, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111342, + "balance_loss_mlp": 1.09572554, + "diversity_loss_mlp": 0.0, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.0727542370097133, + "language_loss": 0.84224409, + "learning_rate": 0.0008776521360894127, + "loss": 0.85335743, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 1303, + "time_per_iteration": 2.6512627601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_mlp": 1.02259421, + "diversity_loss_mlp": 0.0, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.02979233866947858, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79991817, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.07128906, + "routerloss_mlp": 0.0, + "step": 1304, + "time_per_iteration": 4.802467107772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112993, + "balance_loss_mlp": 1.11518431, + "diversity_loss_mlp": 0.0, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.07060498048015267, + "language_loss": 0.9057076, + "learning_rate": 0.0008772434893213186, + "loss": 0.91700697, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1305, + "time_per_iteration": 2.601546049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137218, + "balance_loss_mlp": 1.12251997, + "diversity_loss_mlp": 0.0, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.13797279723809866, + "language_loss": 0.84362888, + "learning_rate": 0.0008770389462092276, + "loss": 0.85500103, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1306, + "time_per_iteration": 2.626138210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_mlp": 1.12685966, + "diversity_loss_mlp": 0.0, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08471108342240245, + "language_loss": 0.86803389, + "learning_rate": 0.0008768342567176357, + "loss": 0.87944913, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1307, + "time_per_iteration": 2.8074796199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.12681937, + "diversity_loss_mlp": 0.0, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07263390393133992, + "language_loss": 0.90559924, + "learning_rate": 0.0008766294209260107, + "loss": 0.91701508, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1308, + "time_per_iteration": 2.670790910720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.13312435, + "diversity_loss_mlp": 0.0, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.07764888634730133, + "language_loss": 0.91554916, + "learning_rate": 0.0008764244389138767, + "loss": 0.92702377, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1309, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147476, + "balance_loss_mlp": 1.13318276, + "diversity_loss_mlp": 0.0, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09714227143719616, + "language_loss": 0.82980847, + "learning_rate": 0.000876219310760815, + "loss": 0.8412832, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1310, + "time_per_iteration": 2.8601791858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_mlp": 1.13273418, + "diversity_loss_mlp": 0.0, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.09648806821544922, + "language_loss": 0.81436276, + "learning_rate": 0.0008760140365464631, + "loss": 0.82583249, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1311, + "time_per_iteration": 2.599353790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870128, + "balance_loss_mlp": 1.50605726, + "diversity_loss_mlp": 0.20002663, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.03529693250820236, + "language_loss": 0.871418, + "learning_rate": 0.0008758086163505156, + "loss": 0.88011926, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170862, + "step": 1312, + "time_per_iteration": 2.6166832447052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163863, + "balance_loss_mlp": 1.14953399, + "diversity_loss_mlp": 0.0, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.07147814499844148, + "language_loss": 0.89267951, + "learning_rate": 0.0008756030502527239, + "loss": 0.90431809, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1313, + "time_per_iteration": 2.8452062606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188075, + "balance_loss_mlp": 1.17377019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.09335955432973846, + "language_loss": 0.90298462, + "learning_rate": 0.0008753973383328954, + "loss": 0.91486537, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1314, + "time_per_iteration": 2.6988537311553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.15108287, + "diversity_loss_mlp": 0.0, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08872096542459323, + "language_loss": 0.83944553, + "learning_rate": 0.0008751914806708952, + "loss": 0.85110015, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1315, + "time_per_iteration": 2.6328680515289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.1372478, + "diversity_loss_mlp": 0.0, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.09247066962171595, + "language_loss": 0.81854099, + "learning_rate": 0.0008749854773466439, + "loss": 0.83005595, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1316, + "time_per_iteration": 2.6708498001098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134446, + "balance_loss_mlp": 1.11980653, + "diversity_loss_mlp": 0.0, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.06992463478304738, + "language_loss": 0.84568423, + "learning_rate": 0.0008747793284401192, + "loss": 0.85702872, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1317, + "time_per_iteration": 2.70182204246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_mlp": 1.10560477, + "diversity_loss_mlp": 0.0, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.11229953955213261, + "language_loss": 0.85994983, + "learning_rate": 0.0008745730340313551, + "loss": 0.87115788, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1318, + "time_per_iteration": 2.8026556968688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.1048007, + "diversity_loss_mlp": 0.0, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.0843917818222923, + "language_loss": 0.84519732, + "learning_rate": 0.0008743665942004422, + "loss": 0.85639453, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.14904785, + "routerloss_mlp": 0.0, + "step": 1319, + "time_per_iteration": 2.6717073917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120645, + "balance_loss_mlp": 1.10569644, + "diversity_loss_mlp": 0.0, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.06860607652829093, + "language_loss": 0.92769039, + "learning_rate": 0.0008741600090275277, + "loss": 0.93889689, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1320, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120587, + "balance_loss_mlp": 1.10530448, + "diversity_loss_mlp": 0.0, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.09643257369734548, + "language_loss": 0.8425917, + "learning_rate": 0.0008739532785928151, + "loss": 0.85379755, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1321, + "time_per_iteration": 3.4925267696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_mlp": 1.09305024, + "diversity_loss_mlp": 0.0, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.04547815076873398, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75994641, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1322, + "time_per_iteration": 4.8446879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0085354, + "balance_loss_mlp": 1.4814328, + "diversity_loss_mlp": 0.19370571, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.036800523279172735, + "language_loss": 0.82844102, + "learning_rate": 0.0008735393822590908, + "loss": 0.83697641, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597124, + "step": 1323, + "time_per_iteration": 2.7354650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174586, + "balance_loss_mlp": 1.16032863, + "diversity_loss_mlp": 0.0, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08280852347492981, + "language_loss": 0.87442601, + "learning_rate": 0.0008733322165207681, + "loss": 0.88617194, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1324, + "time_per_iteration": 2.6581695079803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_mlp": 1.18529749, + "diversity_loss_mlp": 0.0, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.0779912319299164, + "language_loss": 0.8296451, + "learning_rate": 0.0008731249058420247, + "loss": 0.84164721, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1325, + "time_per_iteration": 3.0674960613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203892, + "balance_loss_mlp": 1.18865728, + "diversity_loss_mlp": 0.0, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.10695670124077197, + "language_loss": 0.90080667, + "learning_rate": 0.0008729174503033459, + "loss": 0.91284555, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1326, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188403, + "balance_loss_mlp": 1.17334652, + "diversity_loss_mlp": 0.0, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.10125548093505272, + "language_loss": 0.82427752, + "learning_rate": 0.0008727098499852728, + "loss": 0.83616149, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1327, + "time_per_iteration": 2.833803415298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150318, + "balance_loss_mlp": 1.13529778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.08478455973869617, + "language_loss": 0.89778203, + "learning_rate": 0.0008725021049684034, + "loss": 0.90928519, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.15002441, + "routerloss_mlp": 0.0, + "step": 1328, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116795, + "balance_loss_mlp": 1.10194123, + "diversity_loss_mlp": 0.0, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.07099770943741918, + "language_loss": 0.83078361, + "learning_rate": 0.000872294215333391, + "loss": 0.84195161, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1329, + "time_per_iteration": 3.219834089279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099158, + "balance_loss_mlp": 1.08430433, + "diversity_loss_mlp": 0.0, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.06913408205057751, + "language_loss": 0.82662833, + "learning_rate": 0.0008720861811609457, + "loss": 0.8376199, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1330, + "time_per_iteration": 2.753122329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096587, + "balance_loss_mlp": 1.0816741, + "diversity_loss_mlp": 0.0, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0919113566921475, + "language_loss": 0.83719599, + "learning_rate": 0.0008718780025318338, + "loss": 0.84816188, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1331, + "time_per_iteration": 2.724808692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.09296656, + "diversity_loss_mlp": 0.0, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.09880415123515712, + "language_loss": 0.83982158, + "learning_rate": 0.0008716696795268771, + "loss": 0.85089689, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1332, + "time_per_iteration": 2.718421220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098797, + "balance_loss_mlp": 1.08430111, + "diversity_loss_mlp": 0.0, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.15208681676824193, + "language_loss": 0.85333431, + "learning_rate": 0.0008714612122269538, + "loss": 0.8643223, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1333, + "time_per_iteration": 2.877823829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120258, + "balance_loss_mlp": 1.10586989, + "diversity_loss_mlp": 0.0, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.07756137703605612, + "language_loss": 0.89334106, + "learning_rate": 0.0008712526007129982, + "loss": 0.90454364, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1334, + "time_per_iteration": 2.561842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155014, + "balance_loss_mlp": 1.14101923, + "diversity_loss_mlp": 0.0, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.12724628219842446, + "language_loss": 0.90676123, + "learning_rate": 0.0008710438450660003, + "loss": 0.91831136, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1335, + "time_per_iteration": 2.6618270874023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199277, + "balance_loss_mlp": 1.18486404, + "diversity_loss_mlp": 0.0, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.10895723532104484, + "language_loss": 0.87596953, + "learning_rate": 0.0008708349453670064, + "loss": 0.88796222, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1336, + "time_per_iteration": 2.5121865272521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195197, + "balance_loss_mlp": 1.18032002, + "diversity_loss_mlp": 0.0, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.10227195785495524, + "language_loss": 0.91035736, + "learning_rate": 0.0008706259016971185, + "loss": 0.92230934, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1337, + "time_per_iteration": 2.7760090827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_mlp": 1.17414773, + "diversity_loss_mlp": 0.0, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.12625436277937716, + "language_loss": 0.83095431, + "learning_rate": 0.0008704167141374944, + "loss": 0.84284496, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1338, + "time_per_iteration": 2.824122428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_mlp": 1.13107228, + "diversity_loss_mlp": 0.0, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.0801465901926633, + "language_loss": 0.88427222, + "learning_rate": 0.0008702073827693482, + "loss": 0.89573455, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1339, + "time_per_iteration": 2.708488941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_mlp": 1.0865202, + "diversity_loss_mlp": 0.0, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07445900988257396, + "language_loss": 0.88514435, + "learning_rate": 0.0008699979076739494, + "loss": 0.89615613, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1340, + "time_per_iteration": 2.960650682449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07054412, + "diversity_loss_mlp": 0.0, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.09041758143252471, + "language_loss": 0.88622832, + "learning_rate": 0.0008697882889326234, + "loss": 0.89708054, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1341, + "time_per_iteration": 2.5199689865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094608, + "balance_loss_mlp": 1.08043432, + "diversity_loss_mlp": 0.0, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.08157938691300957, + "language_loss": 0.86840844, + "learning_rate": 0.0008695785266267515, + "loss": 0.87935448, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1342, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089859, + "balance_loss_mlp": 1.56664371, + "diversity_loss_mlp": 0.19803861, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.03344075262961686, + "language_loss": 0.83491886, + "learning_rate": 0.0008693686208377704, + "loss": 0.84390479, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01624843, + "step": 1343, + "time_per_iteration": 2.8157622814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101399, + "balance_loss_mlp": 1.08711743, + "diversity_loss_mlp": 0.0, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07460013341605923, + "language_loss": 0.89022982, + "learning_rate": 0.0008691585716471733, + "loss": 0.90124375, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1344, + "time_per_iteration": 2.6386232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.09707415, + "diversity_loss_mlp": 0.0, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.08548738123283665, + "language_loss": 0.85822487, + "learning_rate": 0.0008689483791365079, + "loss": 0.86934054, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1345, + "time_per_iteration": 2.831817626953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112096, + "balance_loss_mlp": 1.10685778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07218857890204664, + "language_loss": 0.89327282, + "learning_rate": 0.0008687380433873786, + "loss": 0.90448248, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1346, + "time_per_iteration": 2.8322408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1251955, + "diversity_loss_mlp": 0.0, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.07612070672802876, + "language_loss": 0.82638776, + "learning_rate": 0.0008685275644814448, + "loss": 0.83778065, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1347, + "time_per_iteration": 2.689772367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116224, + "balance_loss_mlp": 1.14764857, + "diversity_loss_mlp": 0.0, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07884944678342334, + "language_loss": 0.84390515, + "learning_rate": 0.0008683169425004216, + "loss": 0.85552752, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1348, + "time_per_iteration": 2.895153760910034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159732, + "balance_loss_mlp": 1.14511704, + "diversity_loss_mlp": 0.0, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.10354145261803285, + "language_loss": 0.83314335, + "learning_rate": 0.0008681061775260799, + "loss": 0.84474063, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1349, + "time_per_iteration": 2.850862503051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_mlp": 1.15118265, + "diversity_loss_mlp": 0.0, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08416928552821445, + "language_loss": 0.9214983, + "learning_rate": 0.0008678952696402458, + "loss": 0.93315852, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1350, + "time_per_iteration": 2.525019884109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_mlp": 1.13848734, + "diversity_loss_mlp": 0.0, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07397225666721696, + "language_loss": 0.86554277, + "learning_rate": 0.000867684218924801, + "loss": 0.87707639, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1351, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_mlp": 1.07517958, + "diversity_loss_mlp": 0.0, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.0438698963901256, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80030328, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1352, + "time_per_iteration": 4.916059255599976 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132931, + "balance_loss_mlp": 1.11807716, + "diversity_loss_mlp": 0.0, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06358739416567256, + "language_loss": 0.85154414, + "learning_rate": 0.0008672616893328834, + "loss": 0.86287344, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1353, + "time_per_iteration": 2.9301464557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120208, + "balance_loss_mlp": 1.10545015, + "diversity_loss_mlp": 0.0, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.0804298790611747, + "language_loss": 0.89736795, + "learning_rate": 0.0008670502106204512, + "loss": 0.90857005, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.14733887, + "routerloss_mlp": 0.0, + "step": 1354, + "time_per_iteration": 2.8392651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121529, + "balance_loss_mlp": 1.10672283, + "diversity_loss_mlp": 0.0, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.08121830869095954, + "language_loss": 0.81676221, + "learning_rate": 0.0008668385894064892, + "loss": 0.82797754, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1355, + "time_per_iteration": 2.632744550704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.10095191, + "diversity_loss_mlp": 0.0, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.0871855710564252, + "language_loss": 0.88984954, + "learning_rate": 0.0008666268257731562, + "loss": 0.90100139, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1356, + "time_per_iteration": 3.0961363315582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132093, + "balance_loss_mlp": 1.11785948, + "diversity_loss_mlp": 0.0, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.08548634624367135, + "language_loss": 0.8594982, + "learning_rate": 0.0008664149198026662, + "loss": 0.87081909, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1357, + "time_per_iteration": 3.2423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_mlp": 1.12039137, + "diversity_loss_mlp": 0.0, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.09109654485188295, + "language_loss": 0.88802171, + "learning_rate": 0.0008662028715772883, + "loss": 0.89936113, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1358, + "time_per_iteration": 2.619495153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138578, + "balance_loss_mlp": 1.12476182, + "diversity_loss_mlp": 0.0, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.07135790209188476, + "language_loss": 0.85816395, + "learning_rate": 0.0008659906811793467, + "loss": 0.86954975, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.13842773, + "routerloss_mlp": 0.0, + "step": 1359, + "time_per_iteration": 2.6752817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135637, + "balance_loss_mlp": 1.12191582, + "diversity_loss_mlp": 0.0, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.07783428421444573, + "language_loss": 0.89649427, + "learning_rate": 0.0008657783486912215, + "loss": 0.90785068, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1360, + "time_per_iteration": 2.770136594772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918859, + "balance_loss_mlp": 1.60386825, + "diversity_loss_mlp": 0.20058532, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.03438194549161764, + "language_loss": 0.90315008, + "learning_rate": 0.0008655658741953472, + "loss": 0.91233867, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01663268, + "step": 1361, + "time_per_iteration": 3.239567518234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117406, + "balance_loss_mlp": 1.10352993, + "diversity_loss_mlp": 0.0, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.053733033776962646, + "language_loss": 0.88311911, + "learning_rate": 0.0008653532577742136, + "loss": 0.89429319, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1362, + "time_per_iteration": 2.6912107467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111717, + "balance_loss_mlp": 1.09805584, + "diversity_loss_mlp": 0.0, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07456283347469675, + "language_loss": 0.8687824, + "learning_rate": 0.0008651404995103659, + "loss": 0.87989956, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1363, + "time_per_iteration": 2.5554919242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_mlp": 1.09212554, + "diversity_loss_mlp": 0.0, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.0735216597505126, + "language_loss": 0.87311852, + "learning_rate": 0.0008649275994864041, + "loss": 0.88418221, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1364, + "time_per_iteration": 2.7228429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109936, + "balance_loss_mlp": 1.0955832, + "diversity_loss_mlp": 0.0, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06423000395680191, + "language_loss": 0.83767593, + "learning_rate": 0.0008647145577849834, + "loss": 0.84877527, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1365, + "time_per_iteration": 2.8194234371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_mlp": 1.09573257, + "diversity_loss_mlp": 0.0, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0636918785190987, + "language_loss": 0.82912111, + "learning_rate": 0.0008645013744888139, + "loss": 0.8402251, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1366, + "time_per_iteration": 2.9121909141540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106528, + "balance_loss_mlp": 1.09266424, + "diversity_loss_mlp": 0.0, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.07268525177684865, + "language_loss": 0.87255573, + "learning_rate": 0.0008642880496806607, + "loss": 0.88362104, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1367, + "time_per_iteration": 2.7527663707733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117256, + "balance_loss_mlp": 1.1027844, + "diversity_loss_mlp": 0.0, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.06883104565378229, + "language_loss": 0.84193766, + "learning_rate": 0.0008640745834433437, + "loss": 0.85311019, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1368, + "time_per_iteration": 2.7203800678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114698, + "balance_loss_mlp": 1.10065532, + "diversity_loss_mlp": 0.0, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.0718323039568536, + "language_loss": 0.87083656, + "learning_rate": 0.000863860975859738, + "loss": 0.88198352, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1369, + "time_per_iteration": 2.9021553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116648, + "balance_loss_mlp": 1.10278392, + "diversity_loss_mlp": 0.0, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08463505288724613, + "language_loss": 0.88568735, + "learning_rate": 0.0008636472270127733, + "loss": 0.8968538, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1370, + "time_per_iteration": 2.6336748600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_mlp": 1.10440779, + "diversity_loss_mlp": 0.0, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.08505114845208346, + "language_loss": 0.90530956, + "learning_rate": 0.0008634333369854345, + "loss": 0.91649872, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1371, + "time_per_iteration": 2.585775136947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122621, + "balance_loss_mlp": 1.10868549, + "diversity_loss_mlp": 0.0, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.07138701063901956, + "language_loss": 0.87574148, + "learning_rate": 0.0008632193058607608, + "loss": 0.88696772, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.13952637, + "routerloss_mlp": 0.0, + "step": 1372, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124687, + "balance_loss_mlp": 1.11042953, + "diversity_loss_mlp": 0.0, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.09395332240398839, + "language_loss": 0.81125695, + "learning_rate": 0.0008630051337218466, + "loss": 0.82250381, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1373, + "time_per_iteration": 2.6700031757354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118707, + "balance_loss_mlp": 1.10506988, + "diversity_loss_mlp": 0.0, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0808240378873911, + "language_loss": 0.82403839, + "learning_rate": 0.0008627908206518409, + "loss": 0.83522546, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1374, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.04099598647265769, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76212597, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 1375, + "time_per_iteration": 4.979893922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109458, + "balance_loss_mlp": 1.09580863, + "diversity_loss_mlp": 0.0, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06989177478220372, + "language_loss": 0.91488004, + "learning_rate": 0.0008623617720514241, + "loss": 0.92597461, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1376, + "time_per_iteration": 2.6515755653381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_mlp": 1.09554029, + "diversity_loss_mlp": 0.0, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.07399727326907257, + "language_loss": 0.84706682, + "learning_rate": 0.0008621470366875848, + "loss": 0.85816133, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1377, + "time_per_iteration": 2.599776268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119233, + "balance_loss_mlp": 1.10546422, + "diversity_loss_mlp": 0.0, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07769258092785128, + "language_loss": 0.87980253, + "learning_rate": 0.0008619321607257966, + "loss": 0.89099485, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1378, + "time_per_iteration": 2.678865671157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.10274947, + "diversity_loss_mlp": 0.0, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07519514659764338, + "language_loss": 0.82002568, + "learning_rate": 0.000861717144249482, + "loss": 0.83118635, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1379, + "time_per_iteration": 2.8830740451812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_mlp": 1.10515702, + "diversity_loss_mlp": 0.0, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06542821866252439, + "language_loss": 0.89670694, + "learning_rate": 0.0008615019873421175, + "loss": 0.90789306, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.1348877, + "routerloss_mlp": 0.0, + "step": 1380, + "time_per_iteration": 2.4692320823669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124803, + "balance_loss_mlp": 1.11096311, + "diversity_loss_mlp": 0.0, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.08230289019981965, + "language_loss": 0.85984069, + "learning_rate": 0.0008612866900872349, + "loss": 0.87108874, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1381, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119212, + "balance_loss_mlp": 1.10564578, + "diversity_loss_mlp": 0.0, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.09708901974799254, + "language_loss": 0.8800329, + "learning_rate": 0.0008610712525684197, + "loss": 0.89122504, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1382, + "time_per_iteration": 2.673672676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.12075388, + "diversity_loss_mlp": 0.0, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.08550137436350284, + "language_loss": 0.84231853, + "learning_rate": 0.0008608556748693121, + "loss": 0.85366714, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1383, + "time_per_iteration": 3.285391330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113293, + "balance_loss_mlp": 1.11881518, + "diversity_loss_mlp": 0.0, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.07276264363306281, + "language_loss": 0.86098409, + "learning_rate": 0.000860639957073607, + "loss": 0.87231338, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1384, + "time_per_iteration": 2.74979829788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130834, + "balance_loss_mlp": 1.11668396, + "diversity_loss_mlp": 0.0, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07735164598050102, + "language_loss": 0.87488532, + "learning_rate": 0.0008604240992650534, + "loss": 0.88619369, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1385, + "time_per_iteration": 2.765714406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113264, + "balance_loss_mlp": 1.11819148, + "diversity_loss_mlp": 0.0, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.09224305204204497, + "language_loss": 0.89344275, + "learning_rate": 0.0008602081015274545, + "loss": 0.90476912, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1386, + "time_per_iteration": 2.7466471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_mlp": 1.11580229, + "diversity_loss_mlp": 0.0, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.08049268911379595, + "language_loss": 0.83551365, + "learning_rate": 0.0008599919639446684, + "loss": 0.84681749, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1387, + "time_per_iteration": 2.680053234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119435, + "balance_loss_mlp": 1.10439074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08313146027802099, + "language_loss": 0.80363739, + "learning_rate": 0.000859775686600607, + "loss": 0.81483173, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1388, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114186, + "balance_loss_mlp": 1.12722135, + "diversity_loss_mlp": 0.0, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.08559032433145165, + "language_loss": 0.85052109, + "learning_rate": 0.0008595592695792367, + "loss": 0.86193967, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1389, + "time_per_iteration": 2.660012722015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_mlp": 1.11312914, + "diversity_loss_mlp": 0.0, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.07620364037172102, + "language_loss": 0.90774226, + "learning_rate": 0.0008593427129645778, + "loss": 0.91901946, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1390, + "time_per_iteration": 2.62744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131127, + "balance_loss_mlp": 1.11615419, + "diversity_loss_mlp": 0.0, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.0742307152228864, + "language_loss": 0.85619152, + "learning_rate": 0.0008591260168407052, + "loss": 0.86750275, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1391, + "time_per_iteration": 2.738680124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_mlp": 1.09930313, + "diversity_loss_mlp": 0.0, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.05574398067767488, + "language_loss": 0.82839364, + "learning_rate": 0.0008589091812917479, + "loss": 0.83953172, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1392, + "time_per_iteration": 2.5947506427764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_mlp": 1.09471345, + "diversity_loss_mlp": 0.0, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07022348692687568, + "language_loss": 0.85257161, + "learning_rate": 0.0008586922064018887, + "loss": 0.86366403, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1393, + "time_per_iteration": 2.6624581813812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_mlp": 1.09542501, + "diversity_loss_mlp": 0.0, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.07561979453055602, + "language_loss": 0.89401793, + "learning_rate": 0.0008584750922553651, + "loss": 0.9051199, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1394, + "time_per_iteration": 3.1940202713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107917, + "balance_loss_mlp": 1.0934931, + "diversity_loss_mlp": 0.0, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.07234350422575066, + "language_loss": 0.83740592, + "learning_rate": 0.0008582578389364677, + "loss": 0.84848505, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1395, + "time_per_iteration": 2.8844621181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_mlp": 1.09147811, + "diversity_loss_mlp": 0.0, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.061968206774760184, + "language_loss": 0.91908813, + "learning_rate": 0.0008580404465295422, + "loss": 0.93014938, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1396, + "time_per_iteration": 2.7842769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_mlp": 1.09155917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07293181793333794, + "language_loss": 0.88274646, + "learning_rate": 0.0008578229151189876, + "loss": 0.89380777, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1397, + "time_per_iteration": 2.96771502494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_mlp": 1.08638036, + "diversity_loss_mlp": 0.0, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.08798004746081324, + "language_loss": 0.81253606, + "learning_rate": 0.0008576052447892573, + "loss": 0.82354569, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1398, + "time_per_iteration": 2.5413830280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_mlp": 1.08761334, + "diversity_loss_mlp": 0.0, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.0737959226904994, + "language_loss": 0.86320835, + "learning_rate": 0.000857387435624858, + "loss": 0.87422657, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1399, + "time_per_iteration": 2.554016351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00934821, + "balance_loss_mlp": 1.63627267, + "diversity_loss_mlp": 0.20064378, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.02492172823463741, + "language_loss": 0.88190895, + "learning_rate": 0.0008571694877103513, + "loss": 0.89125717, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636335, + "step": 1400, + "time_per_iteration": 3.307114839553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_mlp": 1.09591365, + "diversity_loss_mlp": 0.0, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.07757128819182789, + "language_loss": 0.87680864, + "learning_rate": 0.0008569514011303515, + "loss": 0.88791251, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1401, + "time_per_iteration": 2.800502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917512, + "balance_loss_mlp": 1.60226941, + "diversity_loss_mlp": 0.19939175, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.03393521208879438, + "language_loss": 0.88186574, + "learning_rate": 0.0008567331759695277, + "loss": 0.8910408, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01668182, + "step": 1402, + "time_per_iteration": 2.7670016288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108043, + "balance_loss_mlp": 1.09297514, + "diversity_loss_mlp": 0.0, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.0674494366068644, + "language_loss": 0.86427194, + "learning_rate": 0.0008565148123126023, + "loss": 0.87535238, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1403, + "time_per_iteration": 2.660659074783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_mlp": 1.08053553, + "diversity_loss_mlp": 0.0, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.059221605294443855, + "language_loss": 0.86113608, + "learning_rate": 0.0008562963102443516, + "loss": 0.8720839, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1404, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_mlp": 1.090042, + "diversity_loss_mlp": 0.0, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.08483345099627004, + "language_loss": 0.85166299, + "learning_rate": 0.0008560776698496056, + "loss": 0.86270541, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1405, + "time_per_iteration": 2.9167518615722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110133, + "balance_loss_mlp": 1.09539831, + "diversity_loss_mlp": 0.0, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.06923600464578249, + "language_loss": 0.85861331, + "learning_rate": 0.0008558588912132481, + "loss": 0.86971468, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1406, + "time_per_iteration": 2.8346776962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00696474, + "balance_loss_mlp": 1.17983532, + "diversity_loss_mlp": 0.18206902, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.0036772550136199766, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77155459, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0155216, + "step": 1407, + "time_per_iteration": 4.943782091140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.09137964, + "diversity_loss_mlp": 0.0, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.08329945876184135, + "language_loss": 0.82942384, + "learning_rate": 0.0008554209195555016, + "loss": 0.84047806, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1408, + "time_per_iteration": 2.7417516708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125368, + "balance_loss_mlp": 1.11146832, + "diversity_loss_mlp": 0.0, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.06975199960684045, + "language_loss": 0.8827157, + "learning_rate": 0.0008552017267041483, + "loss": 0.89396936, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1409, + "time_per_iteration": 2.6978721618652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126015, + "balance_loss_mlp": 1.11216331, + "diversity_loss_mlp": 0.0, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06710824628929367, + "language_loss": 0.83395678, + "learning_rate": 0.0008549823959512549, + "loss": 0.84521693, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1410, + "time_per_iteration": 2.6867637634277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_mlp": 1.11246991, + "diversity_loss_mlp": 0.0, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07002470067050659, + "language_loss": 0.86486357, + "learning_rate": 0.0008547629273819728, + "loss": 0.87612069, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.13262939, + "routerloss_mlp": 0.0, + "step": 1411, + "time_per_iteration": 3.410454750061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_mlp": 1.12940812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07619635814943253, + "language_loss": 0.83522588, + "learning_rate": 0.0008545433210815074, + "loss": 0.84665549, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1412, + "time_per_iteration": 2.638172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.12536621, + "diversity_loss_mlp": 0.0, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.06317158203016926, + "language_loss": 0.87351668, + "learning_rate": 0.0008543235771351176, + "loss": 0.88490719, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1413, + "time_per_iteration": 2.7705581188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159735, + "balance_loss_mlp": 1.14645457, + "diversity_loss_mlp": 0.0, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.08259318688939964, + "language_loss": 0.84684592, + "learning_rate": 0.0008541036956281154, + "loss": 0.85844326, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1414, + "time_per_iteration": 2.8803579807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_mlp": 1.13435841, + "diversity_loss_mlp": 0.0, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09396951476817994, + "language_loss": 0.81928164, + "learning_rate": 0.0008538836766458665, + "loss": 0.83076018, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.13519287, + "routerloss_mlp": 0.0, + "step": 1415, + "time_per_iteration": 2.860991954803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140979, + "balance_loss_mlp": 1.12721062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.07553622395064079, + "language_loss": 0.84927893, + "learning_rate": 0.0008536635202737897, + "loss": 0.86068869, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1416, + "time_per_iteration": 2.848196268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146453, + "balance_loss_mlp": 1.13278019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.07031625369418516, + "language_loss": 0.82188255, + "learning_rate": 0.0008534432265973573, + "loss": 0.83334708, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1417, + "time_per_iteration": 2.6029789447784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153419, + "balance_loss_mlp": 1.13950717, + "diversity_loss_mlp": 0.0, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07823597875801033, + "language_loss": 0.88322413, + "learning_rate": 0.000853222795702095, + "loss": 0.89475828, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1418, + "time_per_iteration": 3.3933968544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.13570726, + "diversity_loss_mlp": 0.0, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.07267637680100167, + "language_loss": 0.83730674, + "learning_rate": 0.0008530022276735813, + "loss": 0.84880364, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1419, + "time_per_iteration": 2.766181707382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_mlp": 1.12086129, + "diversity_loss_mlp": 0.0, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.06887995103877555, + "language_loss": 0.86238861, + "learning_rate": 0.0008527815225974489, + "loss": 0.87373358, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1420, + "time_per_iteration": 2.6471102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135972, + "balance_loss_mlp": 1.12148833, + "diversity_loss_mlp": 0.0, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10131461494963417, + "language_loss": 0.88726115, + "learning_rate": 0.0008525606805593829, + "loss": 0.89862096, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1421, + "time_per_iteration": 2.436647653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_mlp": 1.10405266, + "diversity_loss_mlp": 0.0, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0859881194807961, + "language_loss": 0.8254106, + "learning_rate": 0.0008523397016451213, + "loss": 0.83659345, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1422, + "time_per_iteration": 2.593588352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_mlp": 1.08907628, + "diversity_loss_mlp": 0.0, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.06052148467578676, + "language_loss": 0.87038374, + "learning_rate": 0.0008521185859404564, + "loss": 0.88142037, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1423, + "time_per_iteration": 3.3936307430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092129, + "balance_loss_mlp": 1.07775199, + "diversity_loss_mlp": 0.0, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06977326166261295, + "language_loss": 0.8940134, + "learning_rate": 0.0008518973335312326, + "loss": 0.90493476, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1424, + "time_per_iteration": 2.7834270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081272, + "balance_loss_mlp": 1.06702638, + "diversity_loss_mlp": 0.0, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.119675165593639, + "language_loss": 0.83282709, + "learning_rate": 0.0008516759445033477, + "loss": 0.84363985, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1425, + "time_per_iteration": 2.665099859237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.06930685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08266887436661914, + "language_loss": 0.85026807, + "learning_rate": 0.0008514544189427526, + "loss": 0.86110568, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1426, + "time_per_iteration": 2.6887404918670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_mlp": 1.07249546, + "diversity_loss_mlp": 0.0, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.06908859165293682, + "language_loss": 0.86575979, + "learning_rate": 0.0008512327569354511, + "loss": 0.87662017, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1427, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_mlp": 1.09480238, + "diversity_loss_mlp": 0.0, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08987008099145026, + "language_loss": 0.8368206, + "learning_rate": 0.0008510109585675001, + "loss": 0.847902, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.13360596, + "routerloss_mlp": 0.0, + "step": 1428, + "time_per_iteration": 2.613348960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140613, + "balance_loss_mlp": 1.13260245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.05207498704371428, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82293957, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1429, + "time_per_iteration": 4.706013202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133032, + "balance_loss_mlp": 1.11977601, + "diversity_loss_mlp": 0.0, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.09002666847623074, + "language_loss": 0.80503839, + "learning_rate": 0.0008505669530941415, + "loss": 0.8163687, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1430, + "time_per_iteration": 3.2976372241973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0097004, + "balance_loss_mlp": 1.70641518, + "diversity_loss_mlp": 0.20088202, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.03747760406507578, + "language_loss": 0.84294951, + "learning_rate": 0.000850344746161112, + "loss": 0.85264993, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01639144, + "step": 1431, + "time_per_iteration": 2.6297106742858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_mlp": 1.12685704, + "diversity_loss_mlp": 0.0, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.08230554095697513, + "language_loss": 0.87346137, + "learning_rate": 0.0008501224032121894, + "loss": 0.88486063, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1432, + "time_per_iteration": 2.4853787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129049, + "balance_loss_mlp": 1.1158998, + "diversity_loss_mlp": 0.0, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06557126517551867, + "language_loss": 0.82118285, + "learning_rate": 0.0008498999243336946, + "loss": 0.83247334, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1433, + "time_per_iteration": 2.623809576034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11776567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.0832335684907068, + "language_loss": 0.87471139, + "learning_rate": 0.0008496773096120021, + "loss": 0.88601708, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.12817383, + "routerloss_mlp": 0.0, + "step": 1434, + "time_per_iteration": 2.7995760440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_mlp": 1.10637057, + "diversity_loss_mlp": 0.0, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.10286197296711953, + "language_loss": 0.84387434, + "learning_rate": 0.0008494545591335381, + "loss": 0.85507143, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.13354492, + "routerloss_mlp": 0.0, + "step": 1435, + "time_per_iteration": 2.933576822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_mlp": 1.09978795, + "diversity_loss_mlp": 0.0, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.053150449500146836, + "language_loss": 0.86971611, + "learning_rate": 0.0008492316729847823, + "loss": 0.88084674, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1436, + "time_per_iteration": 2.8865604400634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.09676659, + "diversity_loss_mlp": 0.0, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08937825724590943, + "language_loss": 0.7968539, + "learning_rate": 0.0008490086512522664, + "loss": 0.80795395, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1437, + "time_per_iteration": 2.7166872024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105369, + "balance_loss_mlp": 1.0916723, + "diversity_loss_mlp": 0.0, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.09013751301914075, + "language_loss": 0.90582836, + "learning_rate": 0.0008487854940225755, + "loss": 0.91688204, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1438, + "time_per_iteration": 2.4426465034484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_mlp": 1.08844161, + "diversity_loss_mlp": 0.0, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.09066429268698341, + "language_loss": 0.89896768, + "learning_rate": 0.0008485622013823466, + "loss": 0.90999383, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1439, + "time_per_iteration": 2.599177360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07675576, + "diversity_loss_mlp": 0.0, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08059762035463526, + "language_loss": 0.83446515, + "learning_rate": 0.00084833877341827, + "loss": 0.84537244, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1440, + "time_per_iteration": 2.667215347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.0762167, + "diversity_loss_mlp": 0.0, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.07889497077341047, + "language_loss": 0.80625433, + "learning_rate": 0.000848115210217088, + "loss": 0.81715715, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1441, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.08003855, + "diversity_loss_mlp": 0.0, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08443965058939805, + "language_loss": 0.81771946, + "learning_rate": 0.0008478915118655952, + "loss": 0.82866359, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1442, + "time_per_iteration": 2.743678569793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_mlp": 1.10385561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07019455815968899, + "language_loss": 0.86195552, + "learning_rate": 0.0008476676784506393, + "loss": 0.87313789, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1443, + "time_per_iteration": 2.663422107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.10996866, + "diversity_loss_mlp": 0.0, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.08623331537045495, + "language_loss": 0.81889486, + "learning_rate": 0.0008474437100591201, + "loss": 0.83014178, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1444, + "time_per_iteration": 3.340557813644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_mlp": 1.11489129, + "diversity_loss_mlp": 0.0, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08279806566523454, + "language_loss": 0.85577607, + "learning_rate": 0.0008472196067779898, + "loss": 0.86707067, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1445, + "time_per_iteration": 2.675623655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112665, + "balance_loss_mlp": 1.09800267, + "diversity_loss_mlp": 0.0, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10281028137483857, + "language_loss": 0.85108185, + "learning_rate": 0.0008469953686942531, + "loss": 0.86220849, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1446, + "time_per_iteration": 3.0647382736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933894, + "balance_loss_mlp": 1.63962197, + "diversity_loss_mlp": 0.19544066, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.039122045531048345, + "language_loss": 0.83261281, + "learning_rate": 0.0008467709958949668, + "loss": 0.84195173, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636306, + "step": 1447, + "time_per_iteration": 2.777806043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932176, + "balance_loss_mlp": 1.63710666, + "diversity_loss_mlp": 0.19454433, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.036668832644649825, + "language_loss": 0.85678959, + "learning_rate": 0.0008465464884672403, + "loss": 0.8661114, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01635053, + "step": 1448, + "time_per_iteration": 2.7313778400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109364, + "balance_loss_mlp": 1.07944214, + "diversity_loss_mlp": 0.0, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.08672786191572247, + "language_loss": 0.85892808, + "learning_rate": 0.0008463218464982348, + "loss": 0.86986446, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1449, + "time_per_iteration": 2.8115885257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109775, + "balance_loss_mlp": 1.08367157, + "diversity_loss_mlp": 0.0, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.09681901325388456, + "language_loss": 0.8756566, + "learning_rate": 0.0008460970700751645, + "loss": 0.88663405, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1450, + "time_per_iteration": 3.071645975112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.07963276, + "diversity_loss_mlp": 0.0, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.09020366192691211, + "language_loss": 0.87640095, + "learning_rate": 0.000845872159285295, + "loss": 0.88733411, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1451, + "time_per_iteration": 2.7342164516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_mlp": 1.04301238, + "diversity_loss_mlp": 0.0, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.032344288076380935, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78818536, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1452, + "time_per_iteration": 4.95387077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_mlp": 1.10795009, + "diversity_loss_mlp": 0.0, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.08097200979220782, + "language_loss": 0.86171871, + "learning_rate": 0.0008454219349544836, + "loss": 0.87293363, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1453, + "time_per_iteration": 3.373755693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127619, + "balance_loss_mlp": 1.11439896, + "diversity_loss_mlp": 0.0, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.0882994281711823, + "language_loss": 0.81864405, + "learning_rate": 0.000845196621588334, + "loss": 0.82992017, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.13244629, + "routerloss_mlp": 0.0, + "step": 1454, + "time_per_iteration": 2.758122682571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147815, + "balance_loss_mlp": 1.13453507, + "diversity_loss_mlp": 0.0, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.06575509380885615, + "language_loss": 0.76256007, + "learning_rate": 0.0008449711742049706, + "loss": 0.7740382, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1455, + "time_per_iteration": 2.752345561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.1432693, + "diversity_loss_mlp": 0.0, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.10411587441286801, + "language_loss": 0.84306383, + "learning_rate": 0.0008447455928919196, + "loss": 0.85462898, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1456, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.13327312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.07273170046833245, + "language_loss": 0.86767292, + "learning_rate": 0.0008445198777367595, + "loss": 0.87913817, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1457, + "time_per_iteration": 2.614743947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144047, + "balance_loss_mlp": 1.13080251, + "diversity_loss_mlp": 0.0, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.08362811388708001, + "language_loss": 0.81054902, + "learning_rate": 0.0008442940288271208, + "loss": 0.82198954, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1458, + "time_per_iteration": 2.615705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112578, + "balance_loss_mlp": 1.11191583, + "diversity_loss_mlp": 0.0, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06892977395484212, + "language_loss": 0.8688817, + "learning_rate": 0.0008440680462506856, + "loss": 0.88013953, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1459, + "time_per_iteration": 2.810474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_mlp": 1.10828125, + "diversity_loss_mlp": 0.0, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.06441288224223744, + "language_loss": 0.86424565, + "learning_rate": 0.0008438419300951883, + "loss": 0.87545788, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1460, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_mlp": 1.10215354, + "diversity_loss_mlp": 0.0, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.12446768600100189, + "language_loss": 0.86647975, + "learning_rate": 0.0008436156804484148, + "loss": 0.87763494, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1461, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110833, + "balance_loss_mlp": 1.0965395, + "diversity_loss_mlp": 0.0, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.08490544085138897, + "language_loss": 0.88168794, + "learning_rate": 0.0008433892973982031, + "loss": 0.89279622, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1462, + "time_per_iteration": 2.561211347579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10098886, + "diversity_loss_mlp": 0.0, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07295818188475026, + "language_loss": 0.84776855, + "learning_rate": 0.0008431627810324431, + "loss": 0.85892212, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1463, + "time_per_iteration": 2.654146671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_mlp": 1.10345769, + "diversity_loss_mlp": 0.0, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06893619297503142, + "language_loss": 0.8126353, + "learning_rate": 0.000842936131439076, + "loss": 0.82381272, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1464, + "time_per_iteration": 2.6571760177612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_mlp": 1.1010766, + "diversity_loss_mlp": 0.0, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.07879840484237804, + "language_loss": 0.87885797, + "learning_rate": 0.0008427093487060951, + "loss": 0.89001191, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1465, + "time_per_iteration": 2.6847336292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101907, + "balance_loss_mlp": 1.08776927, + "diversity_loss_mlp": 0.0, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06118480673876746, + "language_loss": 0.84661305, + "learning_rate": 0.000842482432921545, + "loss": 0.8576321, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1466, + "time_per_iteration": 2.884965181350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_mlp": 1.09353852, + "diversity_loss_mlp": 0.0, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07927655906335743, + "language_loss": 0.87199128, + "learning_rate": 0.0008422553841735225, + "loss": 0.88306642, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1467, + "time_per_iteration": 2.528017997741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115631, + "balance_loss_mlp": 1.10146928, + "diversity_loss_mlp": 0.0, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07348722340160863, + "language_loss": 0.84837711, + "learning_rate": 0.0008420282025501757, + "loss": 0.85953343, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1468, + "time_per_iteration": 2.7696359157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_mlp": 1.10156429, + "diversity_loss_mlp": 0.0, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07024793700711117, + "language_loss": 0.85080296, + "learning_rate": 0.0008418008881397043, + "loss": 0.86195612, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1469, + "time_per_iteration": 2.659646511077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115825, + "balance_loss_mlp": 1.10241413, + "diversity_loss_mlp": 0.0, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.12791916727658353, + "language_loss": 0.82420468, + "learning_rate": 0.0008415734410303595, + "loss": 0.83536291, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1470, + "time_per_iteration": 3.2350287437438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_mlp": 1.10672879, + "diversity_loss_mlp": 0.0, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.0700140113394834, + "language_loss": 0.90437436, + "learning_rate": 0.0008413458613104444, + "loss": 0.91557699, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1471, + "time_per_iteration": 2.7219245433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111254, + "balance_loss_mlp": 1.09766376, + "diversity_loss_mlp": 0.0, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.07145574186167022, + "language_loss": 0.83164495, + "learning_rate": 0.0008411181490683129, + "loss": 0.84275752, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1472, + "time_per_iteration": 2.727936029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_mlp": 1.09348917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.0645149730480124, + "language_loss": 0.82377428, + "learning_rate": 0.0008408903043923707, + "loss": 0.83485162, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1473, + "time_per_iteration": 2.9972269535064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_mlp": 1.1004951, + "diversity_loss_mlp": 0.0, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09233547648167305, + "language_loss": 0.81268132, + "learning_rate": 0.0008406623273710754, + "loss": 0.82382679, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1474, + "time_per_iteration": 2.5923123359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_mlp": 1.09263408, + "diversity_loss_mlp": 0.0, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0761903935255829, + "language_loss": 0.8290056, + "learning_rate": 0.0008404342180929351, + "loss": 0.840065, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1475, + "time_per_iteration": 2.664698600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121728, + "balance_loss_mlp": 1.10819817, + "diversity_loss_mlp": 0.0, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.08946081876366527, + "language_loss": 0.81824017, + "learning_rate": 0.00084020597664651, + "loss": 0.82945752, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1476, + "time_per_iteration": 2.7941510677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113829, + "balance_loss_mlp": 1.10019112, + "diversity_loss_mlp": 0.0, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09030679544521746, + "language_loss": 0.83820337, + "learning_rate": 0.0008399776031204111, + "loss": 0.84934169, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1477, + "time_per_iteration": 2.7508158683776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_mlp": 1.08784389, + "diversity_loss_mlp": 0.0, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07642048536310797, + "language_loss": 0.79864645, + "learning_rate": 0.0008397490976033009, + "loss": 0.80966175, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1478, + "time_per_iteration": 2.6500625610351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_mlp": 1.04673624, + "diversity_loss_mlp": 0.0, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.0303646120618472, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933775, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.07373047, + "routerloss_mlp": 0.0, + "step": 1479, + "time_per_iteration": 4.757360935211182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_mlp": 1.08449173, + "diversity_loss_mlp": 0.0, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06570619267025138, + "language_loss": 0.85133117, + "learning_rate": 0.0008392916909509525, + "loss": 0.86231726, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1480, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093081, + "balance_loss_mlp": 1.07888281, + "diversity_loss_mlp": 0.0, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07896332999012158, + "language_loss": 0.8543641, + "learning_rate": 0.0008390627899932954, + "loss": 0.86529493, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1481, + "time_per_iteration": 2.5937705039978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_mlp": 1.08532953, + "diversity_loss_mlp": 0.0, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.08879627929694006, + "language_loss": 0.88894033, + "learning_rate": 0.000838833757399789, + "loss": 0.89994287, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1482, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_mlp": 1.09247661, + "diversity_loss_mlp": 0.0, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08557616325511565, + "language_loss": 0.80760586, + "learning_rate": 0.0008386045932593515, + "loss": 0.81867552, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1483, + "time_per_iteration": 2.6901025772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.09776473, + "diversity_loss_mlp": 0.0, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.0661413109298982, + "language_loss": 0.86017227, + "learning_rate": 0.0008383752976609525, + "loss": 0.87129307, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1484, + "time_per_iteration": 2.9148330688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_mlp": 1.1014719, + "diversity_loss_mlp": 0.0, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06788684976720215, + "language_loss": 0.80004096, + "learning_rate": 0.0008381458706936123, + "loss": 0.81120521, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1485, + "time_per_iteration": 2.681067943572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112387, + "balance_loss_mlp": 1.09728312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06920905175587555, + "language_loss": 0.8725493, + "learning_rate": 0.0008379163124464025, + "loss": 0.88367319, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1486, + "time_per_iteration": 2.7093162536621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.10290396, + "diversity_loss_mlp": 0.0, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.09647963836289664, + "language_loss": 0.77093983, + "learning_rate": 0.0008376866230084452, + "loss": 0.78211844, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1487, + "time_per_iteration": 2.8678433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00910546, + "balance_loss_mlp": 1.59136748, + "diversity_loss_mlp": 0.19592074, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.03660624024989628, + "language_loss": 0.86046171, + "learning_rate": 0.000837456802468914, + "loss": 0.86956716, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01690142, + "step": 1488, + "time_per_iteration": 2.602982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_mlp": 1.08787107, + "diversity_loss_mlp": 0.0, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.0820682475712047, + "language_loss": 0.85374725, + "learning_rate": 0.0008372268509170331, + "loss": 0.86477119, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1489, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099554, + "balance_loss_mlp": 1.08529639, + "diversity_loss_mlp": 0.0, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09305985964981825, + "language_loss": 0.85262501, + "learning_rate": 0.0008369967684420779, + "loss": 0.86362052, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1490, + "time_per_iteration": 2.7102949619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.06912422, + "diversity_loss_mlp": 0.0, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.08804420397834639, + "language_loss": 0.84696782, + "learning_rate": 0.0008367665551333736, + "loss": 0.85779965, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1491, + "time_per_iteration": 2.618272304534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_mlp": 1.07430756, + "diversity_loss_mlp": 0.0, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.07991380194683065, + "language_loss": 0.85525382, + "learning_rate": 0.0008365362110802977, + "loss": 0.86614019, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1492, + "time_per_iteration": 2.851928234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_mlp": 1.08655906, + "diversity_loss_mlp": 0.0, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.0838988471662801, + "language_loss": 0.82620168, + "learning_rate": 0.0008363057363722773, + "loss": 0.83721185, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1493, + "time_per_iteration": 2.853207588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.09245062, + "diversity_loss_mlp": 0.0, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.06826703692619526, + "language_loss": 0.84157109, + "learning_rate": 0.0008360751310987906, + "loss": 0.85263485, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1494, + "time_per_iteration": 2.57387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11695361, + "diversity_loss_mlp": 0.0, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.058749130100992836, + "language_loss": 0.85290074, + "learning_rate": 0.0008358443953493666, + "loss": 0.86420786, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1495, + "time_per_iteration": 2.8883073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164777, + "balance_loss_mlp": 1.15067482, + "diversity_loss_mlp": 0.0, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.08087911977453179, + "language_loss": 0.88221979, + "learning_rate": 0.0008356135292135851, + "loss": 0.89386749, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1496, + "time_per_iteration": 2.5230934619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186431, + "balance_loss_mlp": 1.17226899, + "diversity_loss_mlp": 0.0, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.11116302526442519, + "language_loss": 0.92429602, + "learning_rate": 0.0008353825327810758, + "loss": 0.93616039, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1497, + "time_per_iteration": 2.420966863632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188369, + "balance_loss_mlp": 1.17465985, + "diversity_loss_mlp": 0.0, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.07094257684914687, + "language_loss": 0.8160103, + "learning_rate": 0.00083515140614152, + "loss": 0.82789397, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1498, + "time_per_iteration": 2.7105205059051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172297, + "balance_loss_mlp": 1.15901685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.09212284213685974, + "language_loss": 0.87059236, + "learning_rate": 0.0008349201493846485, + "loss": 0.88231528, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1499, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.13470435, + "diversity_loss_mlp": 0.0, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07375807574735407, + "language_loss": 0.88790113, + "learning_rate": 0.0008346887626002432, + "loss": 0.89938325, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1500, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919256, + "balance_loss_mlp": 1.60489607, + "diversity_loss_mlp": 0.19980004, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.030907333217789122, + "language_loss": 0.85892522, + "learning_rate": 0.000834457245878137, + "loss": 0.86811781, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0169074, + "step": 1501, + "time_per_iteration": 2.6543540954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112198, + "balance_loss_mlp": 1.10861671, + "diversity_loss_mlp": 0.0, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.09029230185558035, + "language_loss": 0.81450766, + "learning_rate": 0.000834225599308212, + "loss": 0.82572746, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1502, + "time_per_iteration": 3.2493886947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_mlp": 1.11191428, + "diversity_loss_mlp": 0.0, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07343077704271528, + "language_loss": 0.85592055, + "learning_rate": 0.0008339938229804016, + "loss": 0.86717403, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.13458252, + "routerloss_mlp": 0.0, + "step": 1503, + "time_per_iteration": 2.712455987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.08344853, + "diversity_loss_mlp": 0.0, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.040592353184382625, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76525998, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1504, + "time_per_iteration": 4.975377082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_mlp": 1.10320854, + "diversity_loss_mlp": 0.0, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.10665663300821891, + "language_loss": 0.84014988, + "learning_rate": 0.0008335298814111094, + "loss": 0.85132295, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1505, + "time_per_iteration": 2.563352584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119478, + "balance_loss_mlp": 1.10572124, + "diversity_loss_mlp": 0.0, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.07488877863745698, + "language_loss": 0.87982982, + "learning_rate": 0.0008332977163497455, + "loss": 0.89102459, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1506, + "time_per_iteration": 2.799177646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.10419846, + "diversity_loss_mlp": 0.0, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.08855239932012744, + "language_loss": 0.83522987, + "learning_rate": 0.0008330654218907325, + "loss": 0.84640789, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1507, + "time_per_iteration": 2.7311654090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130891, + "balance_loss_mlp": 1.1170032, + "diversity_loss_mlp": 0.0, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06185767339129184, + "language_loss": 0.82011658, + "learning_rate": 0.0008328329981242548, + "loss": 0.83142549, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1508, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148949, + "balance_loss_mlp": 1.13483465, + "diversity_loss_mlp": 0.0, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.0780337340178098, + "language_loss": 0.88045996, + "learning_rate": 0.0008326004451405475, + "loss": 0.89194947, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1509, + "time_per_iteration": 2.7449288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146827, + "balance_loss_mlp": 1.13290334, + "diversity_loss_mlp": 0.0, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07615169765943663, + "language_loss": 0.82328165, + "learning_rate": 0.0008323677630298957, + "loss": 0.83474988, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1510, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911058, + "balance_loss_mlp": 1.59209251, + "diversity_loss_mlp": 0.19929613, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.030084219280472915, + "language_loss": 0.84789264, + "learning_rate": 0.0008321349518826345, + "loss": 0.85700321, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01536426, + "step": 1511, + "time_per_iteration": 2.85006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167449, + "balance_loss_mlp": 1.15337038, + "diversity_loss_mlp": 0.0, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.09547204503407083, + "language_loss": 0.94614309, + "learning_rate": 0.0008319020117891491, + "loss": 0.95781755, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1512, + "time_per_iteration": 2.619699001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150869, + "balance_loss_mlp": 1.13603973, + "diversity_loss_mlp": 0.0, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0903449194731753, + "language_loss": 0.86757064, + "learning_rate": 0.0008316689428398751, + "loss": 0.87907934, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1513, + "time_per_iteration": 2.6975061893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_mlp": 1.10804975, + "diversity_loss_mlp": 0.0, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05700485295001885, + "language_loss": 0.88661957, + "learning_rate": 0.0008314357451252979, + "loss": 0.89784312, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1514, + "time_per_iteration": 2.7759623527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_mlp": 1.08762062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.06876651723291546, + "language_loss": 0.87979865, + "learning_rate": 0.0008312024187359527, + "loss": 0.89081734, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1515, + "time_per_iteration": 2.6594746112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108928, + "balance_loss_mlp": 1.07499838, + "diversity_loss_mlp": 0.0, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.06943657009436902, + "language_loss": 0.87168229, + "learning_rate": 0.000830968963762425, + "loss": 0.88257504, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1516, + "time_per_iteration": 3.0544168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078645, + "balance_loss_mlp": 1.06457818, + "diversity_loss_mlp": 0.0, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.07942748937188983, + "language_loss": 0.84183443, + "learning_rate": 0.0008307353802953497, + "loss": 0.85262084, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1517, + "time_per_iteration": 2.7325901985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.06031072, + "diversity_loss_mlp": 0.0, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.0903207444065502, + "language_loss": 0.86203992, + "learning_rate": 0.0008305016684254125, + "loss": 0.87279052, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1518, + "time_per_iteration": 2.790580987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_mlp": 1.05908012, + "diversity_loss_mlp": 0.0, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07640210633127195, + "language_loss": 0.86818451, + "learning_rate": 0.0008302678282433479, + "loss": 0.87892002, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1519, + "time_per_iteration": 2.594045400619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077986, + "balance_loss_mlp": 1.06394291, + "diversity_loss_mlp": 0.0, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07607218771192015, + "language_loss": 0.84937745, + "learning_rate": 0.0008300338598399411, + "loss": 0.86015737, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1520, + "time_per_iteration": 2.6176183223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897129, + "balance_loss_mlp": 1.56367016, + "diversity_loss_mlp": 0.19839743, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.03454500929264816, + "language_loss": 0.94754219, + "learning_rate": 0.0008297997633060263, + "loss": 0.95651346, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160955, + "step": 1521, + "time_per_iteration": 2.5507402420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_mlp": 1.08445215, + "diversity_loss_mlp": 0.0, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07923859397995789, + "language_loss": 0.84868819, + "learning_rate": 0.0008295655387324883, + "loss": 0.8596729, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1522, + "time_per_iteration": 2.942894458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_mlp": 1.08957708, + "diversity_loss_mlp": 0.0, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.09185291067452052, + "language_loss": 0.84979212, + "learning_rate": 0.0008293311862102609, + "loss": 0.86082506, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1523, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10218382, + "diversity_loss_mlp": 0.0, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07878242279946136, + "language_loss": 0.88546365, + "learning_rate": 0.0008290967058303275, + "loss": 0.89662319, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1524, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_mlp": 1.10387325, + "diversity_loss_mlp": 0.0, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07157234250277994, + "language_loss": 0.86573815, + "learning_rate": 0.0008288620976837219, + "loss": 0.87690842, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1525, + "time_per_iteration": 2.539079427719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116802, + "balance_loss_mlp": 1.10354626, + "diversity_loss_mlp": 0.0, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07300174969402286, + "language_loss": 0.82548958, + "learning_rate": 0.000828627361861527, + "loss": 0.83665758, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1526, + "time_per_iteration": 2.5784413814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_mlp": 1.10368335, + "diversity_loss_mlp": 0.0, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.105387273671708, + "language_loss": 0.84438479, + "learning_rate": 0.0008283924984548752, + "loss": 0.85555708, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1527, + "time_per_iteration": 2.876854181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136626, + "balance_loss_mlp": 1.12352467, + "diversity_loss_mlp": 0.0, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.07473419184062492, + "language_loss": 0.84776825, + "learning_rate": 0.0008281575075549485, + "loss": 0.8591345, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1528, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_mlp": 1.09631968, + "diversity_loss_mlp": 0.0, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.053938657910520806, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78456688, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1529, + "time_per_iteration": 4.633493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149647, + "balance_loss_mlp": 1.13666511, + "diversity_loss_mlp": 0.0, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.07225715112962865, + "language_loss": 0.90511358, + "learning_rate": 0.0008276871436402469, + "loss": 0.91661, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1530, + "time_per_iteration": 2.8149213790893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156897, + "balance_loss_mlp": 1.14402199, + "diversity_loss_mlp": 0.0, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.10076437192912456, + "language_loss": 0.87526608, + "learning_rate": 0.000827451770808083, + "loss": 0.88683504, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1531, + "time_per_iteration": 2.7307019233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_mlp": 1.12402749, + "diversity_loss_mlp": 0.0, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07118672956881426, + "language_loss": 0.8318634, + "learning_rate": 0.0008272162708478674, + "loss": 0.84323561, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1532, + "time_per_iteration": 2.559326648712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_mlp": 1.1222167, + "diversity_loss_mlp": 0.0, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.07324079883183283, + "language_loss": 0.86170006, + "learning_rate": 0.000826980643851029, + "loss": 0.87305093, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1533, + "time_per_iteration": 2.728351354598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120692, + "balance_loss_mlp": 1.10734081, + "diversity_loss_mlp": 0.0, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.07850912920042735, + "language_loss": 0.84523225, + "learning_rate": 0.0008267448899090464, + "loss": 0.85643911, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1534, + "time_per_iteration": 2.595296859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121931, + "balance_loss_mlp": 1.10788798, + "diversity_loss_mlp": 0.0, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07265790711823701, + "language_loss": 0.80930066, + "learning_rate": 0.0008265090091134473, + "loss": 0.82051992, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1535, + "time_per_iteration": 2.8336315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105358, + "balance_loss_mlp": 1.09133863, + "diversity_loss_mlp": 0.0, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.08467148330579209, + "language_loss": 0.80271345, + "learning_rate": 0.0008262730015558088, + "loss": 0.81376696, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1536, + "time_per_iteration": 2.9066760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_mlp": 1.08847594, + "diversity_loss_mlp": 0.0, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.07407642769484, + "language_loss": 0.81805962, + "learning_rate": 0.0008260368673277574, + "loss": 0.82908159, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1537, + "time_per_iteration": 3.1795482635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_mlp": 1.09302735, + "diversity_loss_mlp": 0.0, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06784415515848828, + "language_loss": 0.84026253, + "learning_rate": 0.0008258006065209682, + "loss": 0.85132986, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1538, + "time_per_iteration": 2.766732931137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112658, + "balance_loss_mlp": 1.09863889, + "diversity_loss_mlp": 0.0, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.0747520981493109, + "language_loss": 0.80543184, + "learning_rate": 0.0008255642192271657, + "loss": 0.81655836, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1539, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130833, + "balance_loss_mlp": 1.11683834, + "diversity_loss_mlp": 0.0, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06277821647748005, + "language_loss": 0.83592129, + "learning_rate": 0.0008253277055381241, + "loss": 0.8472296, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1540, + "time_per_iteration": 2.8384311199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138407, + "balance_loss_mlp": 1.12428069, + "diversity_loss_mlp": 0.0, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09924754491110549, + "language_loss": 0.85482454, + "learning_rate": 0.0008250910655456658, + "loss": 0.86620867, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1541, + "time_per_iteration": 3.1718008518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.12016189, + "diversity_loss_mlp": 0.0, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.07747440640117766, + "language_loss": 0.83370835, + "learning_rate": 0.0008248542993416625, + "loss": 0.84504688, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1542, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127147, + "balance_loss_mlp": 1.11278272, + "diversity_loss_mlp": 0.0, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08018137719350796, + "language_loss": 0.83926904, + "learning_rate": 0.0008246174070180352, + "loss": 0.85054052, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1543, + "time_per_iteration": 2.6775217056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.10168624, + "diversity_loss_mlp": 0.0, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09273281815149376, + "language_loss": 0.83928716, + "learning_rate": 0.0008243803886667537, + "loss": 0.85044312, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1544, + "time_per_iteration": 3.0925238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_mlp": 1.09024858, + "diversity_loss_mlp": 0.0, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.06593992881851045, + "language_loss": 0.79115343, + "learning_rate": 0.0008241432443798364, + "loss": 0.80219567, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1545, + "time_per_iteration": 2.839099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_mlp": 1.07518196, + "diversity_loss_mlp": 0.0, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05453506209022983, + "language_loss": 0.85691601, + "learning_rate": 0.0008239059742493512, + "loss": 0.86780155, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1546, + "time_per_iteration": 2.7476751804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088565, + "balance_loss_mlp": 1.07480812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.06672989003234615, + "language_loss": 0.87117672, + "learning_rate": 0.0008236685783674142, + "loss": 0.88206244, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1547, + "time_per_iteration": 3.0519776344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06796312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.04305360715769565, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.772995, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1548, + "time_per_iteration": 4.883166790008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.07123256, + "diversity_loss_mlp": 0.0, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.11160876507978217, + "language_loss": 0.82253683, + "learning_rate": 0.0008231934097178955, + "loss": 0.8333841, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.1350708, + "routerloss_mlp": 0.0, + "step": 1549, + "time_per_iteration": 2.60786771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.07919788, + "diversity_loss_mlp": 0.0, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07843428838445873, + "language_loss": 0.85328496, + "learning_rate": 0.0008229556371347903, + "loss": 0.86420953, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1550, + "time_per_iteration": 2.962412118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106892, + "balance_loss_mlp": 1.09379029, + "diversity_loss_mlp": 0.0, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.0840525031564576, + "language_loss": 0.79399186, + "learning_rate": 0.0008227177391691874, + "loss": 0.80506086, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.13122559, + "routerloss_mlp": 0.0, + "step": 1551, + "time_per_iteration": 3.1673550605773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111848, + "balance_loss_mlp": 1.09871709, + "diversity_loss_mlp": 0.0, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07195743014481873, + "language_loss": 0.89281148, + "learning_rate": 0.0008224797159134463, + "loss": 0.90392995, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1552, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121508, + "balance_loss_mlp": 1.10890126, + "diversity_loss_mlp": 0.0, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07485820549569244, + "language_loss": 0.83144093, + "learning_rate": 0.0008222415674599765, + "loss": 0.84265602, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 1553, + "time_per_iteration": 3.077017068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135128, + "balance_loss_mlp": 1.12165701, + "diversity_loss_mlp": 0.0, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.08671551895934956, + "language_loss": 0.83149582, + "learning_rate": 0.0008220032939012349, + "loss": 0.84284711, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1554, + "time_per_iteration": 2.6689035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115966, + "balance_loss_mlp": 1.10284674, + "diversity_loss_mlp": 0.0, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06666483036401037, + "language_loss": 0.87800217, + "learning_rate": 0.0008217648953297277, + "loss": 0.88916183, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1555, + "time_per_iteration": 2.8417294025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_mlp": 1.10677278, + "diversity_loss_mlp": 0.0, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.08472740856632217, + "language_loss": 0.78017807, + "learning_rate": 0.0008215263718380095, + "loss": 0.7913779, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1556, + "time_per_iteration": 2.682047128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096383, + "balance_loss_mlp": 1.08319807, + "diversity_loss_mlp": 0.0, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07743195715790333, + "language_loss": 0.84389544, + "learning_rate": 0.0008212877235186833, + "loss": 0.85485923, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.13201904, + "routerloss_mlp": 0.0, + "step": 1557, + "time_per_iteration": 2.6532580852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_mlp": 1.06710196, + "diversity_loss_mlp": 0.0, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04061005434024277, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78811955, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 1558, + "time_per_iteration": 4.923272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092088, + "balance_loss_mlp": 1.07896352, + "diversity_loss_mlp": 0.0, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.10565427097675566, + "language_loss": 0.8116585, + "learning_rate": 0.0008208100527678611, + "loss": 0.82257938, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1559, + "time_per_iteration": 2.602773427963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.07101393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11780548804152448, + "language_loss": 0.78494406, + "learning_rate": 0.0008205710305218135, + "loss": 0.79578459, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1560, + "time_per_iteration": 3.013576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089526, + "balance_loss_mlp": 1.07663918, + "diversity_loss_mlp": 0.0, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.08018423106971302, + "language_loss": 0.89838511, + "learning_rate": 0.0008203318838190541, + "loss": 0.9092803, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1561, + "time_per_iteration": 2.741619348526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108989, + "balance_loss_mlp": 1.07702184, + "diversity_loss_mlp": 0.0, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09397123990600864, + "language_loss": 0.85396177, + "learning_rate": 0.0008200926127524281, + "loss": 0.86486065, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1562, + "time_per_iteration": 2.60974383354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_mlp": 1.0936904, + "diversity_loss_mlp": 0.0, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08688269643752358, + "language_loss": 0.83400619, + "learning_rate": 0.0008198532174148289, + "loss": 0.84507322, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1563, + "time_per_iteration": 2.7336533069610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079297, + "balance_loss_mlp": 1.07195389, + "diversity_loss_mlp": 0.0, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.04112604139988501, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81765467, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1564, + "time_per_iteration": 4.828714609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145426, + "balance_loss_mlp": 1.1324501, + "diversity_loss_mlp": 0.0, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08852118135813189, + "language_loss": 0.89291, + "learning_rate": 0.0008193740542985244, + "loss": 0.90436429, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1565, + "time_per_iteration": 2.5988731384277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151488, + "balance_loss_mlp": 1.13872099, + "diversity_loss_mlp": 0.0, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1281977179548432, + "language_loss": 0.86354733, + "learning_rate": 0.0008191342867058467, + "loss": 0.87506223, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1566, + "time_per_iteration": 2.6914639472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.10574174, + "diversity_loss_mlp": 0.0, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.07018370282969584, + "language_loss": 0.83602738, + "learning_rate": 0.0008188943952142509, + "loss": 0.84721458, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1567, + "time_per_iteration": 2.7846438884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111402, + "balance_loss_mlp": 1.09847367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.08750889372003143, + "language_loss": 0.82150149, + "learning_rate": 0.0008186543799168711, + "loss": 0.83261549, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1568, + "time_per_iteration": 3.1300384998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094152, + "balance_loss_mlp": 1.08103871, + "diversity_loss_mlp": 0.0, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.07719475001811499, + "language_loss": 0.88627326, + "learning_rate": 0.0008184142409068892, + "loss": 0.89721477, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.13134766, + "routerloss_mlp": 0.0, + "step": 1569, + "time_per_iteration": 2.9922726154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_mlp": 1.07475495, + "diversity_loss_mlp": 0.0, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.07345065764158631, + "language_loss": 0.86446834, + "learning_rate": 0.000818173978277536, + "loss": 0.87534571, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.12994385, + "routerloss_mlp": 0.0, + "step": 1570, + "time_per_iteration": 2.695930242538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089564, + "balance_loss_mlp": 1.07673669, + "diversity_loss_mlp": 0.0, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.0712021049255776, + "language_loss": 0.83337176, + "learning_rate": 0.000817933592122089, + "loss": 0.84426749, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.12841797, + "routerloss_mlp": 0.0, + "step": 1571, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.07427394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.08283074842036095, + "language_loss": 0.83667982, + "learning_rate": 0.0008176930825338749, + "loss": 0.84755468, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1572, + "time_per_iteration": 2.5447826385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_mlp": 1.07405734, + "diversity_loss_mlp": 0.0, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07741282152017008, + "language_loss": 0.88849854, + "learning_rate": 0.0008174524496062679, + "loss": 0.89937723, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1573, + "time_per_iteration": 2.908740997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_mlp": 1.07822633, + "diversity_loss_mlp": 0.0, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.06962859876416791, + "language_loss": 0.85499102, + "learning_rate": 0.0008172116934326894, + "loss": 0.86591208, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1574, + "time_per_iteration": 2.751488208770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_mlp": 1.08365786, + "diversity_loss_mlp": 0.0, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.09195920466248479, + "language_loss": 0.8794626, + "learning_rate": 0.0008169708141066097, + "loss": 0.89044309, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1575, + "time_per_iteration": 2.5947275161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118908, + "balance_loss_mlp": 1.10441208, + "diversity_loss_mlp": 0.0, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.0784824693742563, + "language_loss": 0.90658617, + "learning_rate": 0.0008167298117215465, + "loss": 0.91777527, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1576, + "time_per_iteration": 2.5396125316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011316, + "balance_loss_mlp": 1.11705649, + "diversity_loss_mlp": 0.0, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.1093253517132677, + "language_loss": 0.87566864, + "learning_rate": 0.0008164886863710649, + "loss": 0.88698471, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1577, + "time_per_iteration": 2.931835412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138004, + "balance_loss_mlp": 1.12323439, + "diversity_loss_mlp": 0.0, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07788016425512684, + "language_loss": 0.8637675, + "learning_rate": 0.0008162474381487783, + "loss": 0.87514758, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1578, + "time_per_iteration": 3.041262626647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125978, + "balance_loss_mlp": 1.11132693, + "diversity_loss_mlp": 0.0, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.1532642042193693, + "language_loss": 0.84568751, + "learning_rate": 0.0008160060671483475, + "loss": 0.8569473, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1579, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_mlp": 1.0942831, + "diversity_loss_mlp": 0.0, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.10001869607158981, + "language_loss": 0.8342396, + "learning_rate": 0.0008157645734634809, + "loss": 0.84532249, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1580, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151521, + "balance_loss_mlp": 1.14064956, + "diversity_loss_mlp": 0.0, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.06737085519591758, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78048015, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 1581, + "time_per_iteration": 4.946556329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00631723, + "balance_loss_mlp": 1.05820811, + "diversity_loss_mlp": 0.17941347, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.002006006723137456, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.73846221, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01291206, + "step": 1582, + "time_per_iteration": 4.897693395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.08376384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.07529557219412701, + "language_loss": 0.83949858, + "learning_rate": 0.000815039357240067, + "loss": 0.85047406, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1583, + "time_per_iteration": 2.6096932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101837, + "balance_loss_mlp": 1.0882473, + "diversity_loss_mlp": 0.0, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.0740498467066553, + "language_loss": 0.84922493, + "learning_rate": 0.0008147973737554952, + "loss": 0.86024332, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1584, + "time_per_iteration": 2.7863824367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_mlp": 1.09364963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.11669723774220289, + "language_loss": 0.85926318, + "learning_rate": 0.000814555268055744, + "loss": 0.87033093, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1585, + "time_per_iteration": 2.6167564392089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_mlp": 1.1022768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07476018488685929, + "language_loss": 0.87489879, + "learning_rate": 0.0008143130402348073, + "loss": 0.88605773, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1586, + "time_per_iteration": 2.6318202018737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112097, + "balance_loss_mlp": 1.10742807, + "diversity_loss_mlp": 0.0, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.07016471467090964, + "language_loss": 0.79198885, + "learning_rate": 0.0008140706903867265, + "loss": 0.80319858, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1587, + "time_per_iteration": 2.82663893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128991, + "balance_loss_mlp": 1.11541307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09040046070353, + "language_loss": 0.90612531, + "learning_rate": 0.0008138282186055897, + "loss": 0.91741514, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1588, + "time_per_iteration": 2.690561294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_mlp": 1.12872136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.07675542780120453, + "language_loss": 0.82382154, + "learning_rate": 0.0008135856249855331, + "loss": 0.83524311, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1589, + "time_per_iteration": 2.6935813426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115907, + "balance_loss_mlp": 1.14551568, + "diversity_loss_mlp": 0.0, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.07642745969896261, + "language_loss": 0.89603746, + "learning_rate": 0.0008133429096207398, + "loss": 0.90762818, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1590, + "time_per_iteration": 2.7690787315368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_mlp": 1.10534787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.03962763613217991, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76425815, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.08203125, + "routerloss_mlp": 0.0, + "step": 1591, + "time_per_iteration": 4.950432538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_mlp": 1.17060041, + "diversity_loss_mlp": 0.0, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.0624915030883944, + "language_loss": 0.8671608, + "learning_rate": 0.0008128571140339123, + "loss": 0.87900144, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1592, + "time_per_iteration": 2.717022657394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.15618944, + "diversity_loss_mlp": 0.0, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.08640912687422367, + "language_loss": 0.87240267, + "learning_rate": 0.0008126140340004805, + "loss": 0.88410139, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1593, + "time_per_iteration": 2.5112054347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.14379096, + "diversity_loss_mlp": 0.0, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06492228459438584, + "language_loss": 0.82168889, + "learning_rate": 0.0008123708325995172, + "loss": 0.83326268, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1594, + "time_per_iteration": 3.193125009536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139509, + "balance_loss_mlp": 1.1256932, + "diversity_loss_mlp": 0.0, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06515151231920442, + "language_loss": 0.79815221, + "learning_rate": 0.0008121275099254414, + "loss": 0.80954736, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1595, + "time_per_iteration": 2.9032304286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133663, + "balance_loss_mlp": 1.12007284, + "diversity_loss_mlp": 0.0, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06899315915000012, + "language_loss": 0.88638222, + "learning_rate": 0.0008118840660727194, + "loss": 0.89771879, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1596, + "time_per_iteration": 2.6298515796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115466, + "balance_loss_mlp": 1.10215056, + "diversity_loss_mlp": 0.0, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06984166924665287, + "language_loss": 0.87847084, + "learning_rate": 0.0008116405011358644, + "loss": 0.88962543, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.13336182, + "routerloss_mlp": 0.0, + "step": 1597, + "time_per_iteration": 3.1922342777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.08212388, + "diversity_loss_mlp": 0.0, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.07145022695402857, + "language_loss": 0.79985273, + "learning_rate": 0.0008113968152094369, + "loss": 0.81081259, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1598, + "time_per_iteration": 2.500500440597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_mlp": 1.07637632, + "diversity_loss_mlp": 0.0, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.07896733537507578, + "language_loss": 0.82477671, + "learning_rate": 0.0008111530083880438, + "loss": 0.83567768, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1599, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.07693791, + "diversity_loss_mlp": 0.0, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.10700735308097704, + "language_loss": 0.86289096, + "learning_rate": 0.0008109090807663399, + "loss": 0.87379909, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1600, + "time_per_iteration": 2.7883458137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_mlp": 1.07049167, + "diversity_loss_mlp": 0.0, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.058046583591585654, + "language_loss": 0.8845669, + "learning_rate": 0.0008106650324390257, + "loss": 0.89541531, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1601, + "time_per_iteration": 2.8250818252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012306, + "balance_loss_mlp": 1.78856134, + "diversity_loss_mlp": 0.20302816, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.03151963489439222, + "language_loss": 0.81347358, + "learning_rate": 0.0008104208635008493, + "loss": 0.8235966, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165114, + "step": 1602, + "time_per_iteration": 2.6824991703033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078191, + "balance_loss_mlp": 1.06365991, + "diversity_loss_mlp": 0.0, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.06925842581040223, + "language_loss": 0.81696957, + "learning_rate": 0.0008101765740466058, + "loss": 0.82775152, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1603, + "time_per_iteration": 2.4828884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083153, + "balance_loss_mlp": 1.06891942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.08194523431430376, + "language_loss": 0.83996522, + "learning_rate": 0.0008099321641711364, + "loss": 0.85079676, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1604, + "time_per_iteration": 2.628990650177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093302, + "balance_loss_mlp": 1.07891393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.066381842407901, + "language_loss": 0.83568424, + "learning_rate": 0.0008096876339693295, + "loss": 0.84661728, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1605, + "time_per_iteration": 2.621486186981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_mlp": 1.0898906, + "diversity_loss_mlp": 0.0, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08065648415588843, + "language_loss": 0.8146233, + "learning_rate": 0.0008094429835361206, + "loss": 0.82566357, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1606, + "time_per_iteration": 2.9436137676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_mlp": 1.08727765, + "diversity_loss_mlp": 0.0, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.06722603246449312, + "language_loss": 0.85730284, + "learning_rate": 0.0008091982129664908, + "loss": 0.86832106, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1607, + "time_per_iteration": 2.6776270866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_mlp": 1.09606481, + "diversity_loss_mlp": 0.0, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07435522574008574, + "language_loss": 0.83177197, + "learning_rate": 0.0008089533223554687, + "loss": 0.842875, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1608, + "time_per_iteration": 2.6971724033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106883, + "balance_loss_mlp": 1.09322155, + "diversity_loss_mlp": 0.0, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08534881839400792, + "language_loss": 0.85436511, + "learning_rate": 0.0008087083117981294, + "loss": 0.86543399, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1609, + "time_per_iteration": 2.873072624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_mlp": 1.08715367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.08408730625442483, + "language_loss": 0.88209295, + "learning_rate": 0.0008084631813895943, + "loss": 0.89310181, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1610, + "time_per_iteration": 2.7717368602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_mlp": 1.0843389, + "diversity_loss_mlp": 0.0, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07291880748627809, + "language_loss": 0.84093356, + "learning_rate": 0.0008082179312250315, + "loss": 0.85191453, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1611, + "time_per_iteration": 2.6323728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167376, + "balance_loss_mlp": 1.15912676, + "diversity_loss_mlp": 0.0, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.06715325583723679, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81023216, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 1612, + "time_per_iteration": 4.837978839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_mlp": 1.09591889, + "diversity_loss_mlp": 0.0, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.04843806861709949, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77733123, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.07861328, + "routerloss_mlp": 0.0, + "step": 1613, + "time_per_iteration": 5.086154937744141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_mlp": 1.10497594, + "diversity_loss_mlp": 0.0, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.09649046421891638, + "language_loss": 0.82414234, + "learning_rate": 0.0008074814631475545, + "loss": 0.83532858, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1614, + "time_per_iteration": 3.3300058841705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115901, + "balance_loss_mlp": 1.10232294, + "diversity_loss_mlp": 0.0, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.10381126956618623, + "language_loss": 0.7917223, + "learning_rate": 0.0008072357349114907, + "loss": 0.80288124, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1615, + "time_per_iteration": 2.692242383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.1100384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.09811598085954727, + "language_loss": 0.88751173, + "learning_rate": 0.0008069898873959363, + "loss": 0.89874619, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1616, + "time_per_iteration": 2.688138723373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119599, + "balance_loss_mlp": 1.10590243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.06496922585492992, + "language_loss": 0.85670269, + "learning_rate": 0.0008067439206963375, + "loss": 0.8678987, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1617, + "time_per_iteration": 2.628465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126727, + "balance_loss_mlp": 1.11359048, + "diversity_loss_mlp": 0.0, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08367367493581554, + "language_loss": 0.86233091, + "learning_rate": 0.0008064978349081873, + "loss": 0.87359822, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1618, + "time_per_iteration": 2.9359195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_mlp": 1.10941529, + "diversity_loss_mlp": 0.0, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.062058920213391884, + "language_loss": 0.86742592, + "learning_rate": 0.0008062516301270245, + "loss": 0.87865382, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1619, + "time_per_iteration": 2.685615301132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968061, + "balance_loss_mlp": 1.70987701, + "diversity_loss_mlp": 0.19448289, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.02692656797073588, + "language_loss": 0.8831743, + "learning_rate": 0.0008060053064484343, + "loss": 0.89285493, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588114, + "step": 1620, + "time_per_iteration": 2.9507076740264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131577, + "balance_loss_mlp": 1.11839283, + "diversity_loss_mlp": 0.0, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.08216719715750098, + "language_loss": 0.85142976, + "learning_rate": 0.0008057588639680482, + "loss": 0.86274558, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.13208008, + "routerloss_mlp": 0.0, + "step": 1621, + "time_per_iteration": 2.7498936653137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00955916, + "balance_loss_mlp": 1.68915153, + "diversity_loss_mlp": 0.19115068, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.038673577194741904, + "language_loss": 0.82934028, + "learning_rate": 0.0008055123027815434, + "loss": 0.83889943, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01576493, + "step": 1622, + "time_per_iteration": 2.92877459526062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10545552, + "diversity_loss_mlp": 0.0, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.11144773799130939, + "language_loss": 0.8492527, + "learning_rate": 0.0008052656229846436, + "loss": 0.86044282, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.13580322, + "routerloss_mlp": 0.0, + "step": 1623, + "time_per_iteration": 2.6647849082946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_mlp": 1.09039474, + "diversity_loss_mlp": 0.0, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.09067734621983937, + "language_loss": 0.90320027, + "learning_rate": 0.0008050188246731182, + "loss": 0.9142437, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1624, + "time_per_iteration": 2.6908931732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_mlp": 1.07360816, + "diversity_loss_mlp": 0.0, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08706559573327896, + "language_loss": 0.8222695, + "learning_rate": 0.0008047719079427834, + "loss": 0.83314216, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1625, + "time_per_iteration": 2.979578733444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281481, + "balance_loss_mlp": 1.27170551, + "diversity_loss_mlp": 0.0, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.09241126848133228, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75633186, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.09765625, + "routerloss_mlp": 0.0, + "step": 1626, + "time_per_iteration": 4.813723802566528 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_mlp": 1.06489933, + "diversity_loss_mlp": 0.0, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.061158387019755324, + "language_loss": 0.86164916, + "learning_rate": 0.0008042777196091757, + "loss": 0.87243509, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1627, + "time_per_iteration": 2.6777052879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931263, + "balance_loss_mlp": 1.63595629, + "diversity_loss_mlp": 0.19502082, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.02888255305303151, + "language_loss": 0.81839561, + "learning_rate": 0.0008040304481977643, + "loss": 0.82770824, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01577434, + "step": 1628, + "time_per_iteration": 2.685519218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.07024312, + "diversity_loss_mlp": 0.0, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.070875243316129, + "language_loss": 0.86462033, + "learning_rate": 0.0008037830587512649, + "loss": 0.875458, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1629, + "time_per_iteration": 3.0812296867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_mlp": 1.07976675, + "diversity_loss_mlp": 0.0, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.07857424850498267, + "language_loss": 0.78910959, + "learning_rate": 0.0008035355513657224, + "loss": 0.80004621, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1630, + "time_per_iteration": 2.509866714477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109932, + "balance_loss_mlp": 1.08518136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.05926482463995905, + "language_loss": 0.9323386, + "learning_rate": 0.0008032879261372279, + "loss": 0.94333184, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1631, + "time_per_iteration": 2.793675422668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121244, + "balance_loss_mlp": 1.20142555, + "diversity_loss_mlp": 0.0, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.0543299042148954, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80848283, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 1632, + "time_per_iteration": 5.6717705726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08712876, + "diversity_loss_mlp": 0.0, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.07399367926820971, + "language_loss": 0.87236691, + "learning_rate": 0.0008027923225359748, + "loss": 0.88337696, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.13885498, + "routerloss_mlp": 0.0, + "step": 1633, + "time_per_iteration": 2.591161012649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09272563, + "diversity_loss_mlp": 0.0, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07361205381971474, + "language_loss": 0.8823992, + "learning_rate": 0.0008025443443556267, + "loss": 0.89347273, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1634, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.09279966, + "diversity_loss_mlp": 0.0, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.05821338652647348, + "language_loss": 0.88174599, + "learning_rate": 0.000802296248717147, + "loss": 0.89281231, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1635, + "time_per_iteration": 2.924661159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_mlp": 1.08889091, + "diversity_loss_mlp": 0.0, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06918051977022115, + "language_loss": 0.78766519, + "learning_rate": 0.0008020480357168554, + "loss": 0.79869324, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1636, + "time_per_iteration": 2.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_mlp": 1.08334041, + "diversity_loss_mlp": 0.0, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.061070409346790804, + "language_loss": 0.88343245, + "learning_rate": 0.0008017997054511165, + "loss": 0.89440191, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.13623047, + "routerloss_mlp": 0.0, + "step": 1637, + "time_per_iteration": 2.5770463943481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109567, + "balance_loss_mlp": 1.08241367, + "diversity_loss_mlp": 0.0, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06082888573267997, + "language_loss": 0.85688329, + "learning_rate": 0.0008015512580163407, + "loss": 0.86783999, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1638, + "time_per_iteration": 2.7893900871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915347, + "balance_loss_mlp": 1.6005652, + "diversity_loss_mlp": 0.19760543, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.03200753828687725, + "language_loss": 0.80247211, + "learning_rate": 0.0008013026935089838, + "loss": 0.8116256, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0162621, + "step": 1639, + "time_per_iteration": 2.9013028144836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116887, + "balance_loss_mlp": 1.10366678, + "diversity_loss_mlp": 0.0, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.07107229367788748, + "language_loss": 0.84156835, + "learning_rate": 0.0008010540120255472, + "loss": 0.85273731, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1640, + "time_per_iteration": 2.6617894172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122905, + "balance_loss_mlp": 1.10991144, + "diversity_loss_mlp": 0.0, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.08316081918757003, + "language_loss": 0.86058956, + "learning_rate": 0.0008008052136625774, + "loss": 0.87181866, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1641, + "time_per_iteration": 2.8128581047058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.10461712, + "diversity_loss_mlp": 0.0, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.11340060957388516, + "language_loss": 0.86898887, + "learning_rate": 0.0008005562985166666, + "loss": 0.88016647, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1642, + "time_per_iteration": 2.6915791034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113412, + "balance_loss_mlp": 1.10045385, + "diversity_loss_mlp": 0.0, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.06371803301806024, + "language_loss": 0.85065734, + "learning_rate": 0.0008003072666844524, + "loss": 0.86179143, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.12976074, + "routerloss_mlp": 0.0, + "step": 1643, + "time_per_iteration": 2.713515520095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_mlp": 1.09287417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.09207812275617455, + "language_loss": 0.82446098, + "learning_rate": 0.0008000581182626173, + "loss": 0.83551639, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 1644, + "time_per_iteration": 2.5728507041931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099382, + "balance_loss_mlp": 1.08668065, + "diversity_loss_mlp": 0.0, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.07446065392993936, + "language_loss": 0.86341298, + "learning_rate": 0.0007998088533478894, + "loss": 0.87440687, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 1645, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_mlp": 1.09096265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.09512310951915111, + "language_loss": 0.84171218, + "learning_rate": 0.000799559472037042, + "loss": 0.85274899, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 1646, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089286, + "balance_loss_mlp": 1.07678151, + "diversity_loss_mlp": 0.0, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05690135295492242, + "language_loss": 0.87462902, + "learning_rate": 0.0007993099744268932, + "loss": 0.88552189, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1647, + "time_per_iteration": 2.9204719066619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_mlp": 1.08491409, + "diversity_loss_mlp": 0.0, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08028992569563033, + "language_loss": 0.88103539, + "learning_rate": 0.000799060360614307, + "loss": 0.8920151, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1648, + "time_per_iteration": 2.7098584175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094994, + "balance_loss_mlp": 1.08204746, + "diversity_loss_mlp": 0.0, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07374581447427947, + "language_loss": 0.83565277, + "learning_rate": 0.0007988106306961917, + "loss": 0.84660268, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1649, + "time_per_iteration": 3.136148691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096074, + "balance_loss_mlp": 1.08292556, + "diversity_loss_mlp": 0.0, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.08307651310008923, + "language_loss": 0.84510154, + "learning_rate": 0.0007985607847695014, + "loss": 0.85606229, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1650, + "time_per_iteration": 2.6657865047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090136, + "balance_loss_mlp": 1.07697558, + "diversity_loss_mlp": 0.0, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.07221907468491222, + "language_loss": 0.82981718, + "learning_rate": 0.0007983108229312345, + "loss": 0.84071863, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.13183594, + "routerloss_mlp": 0.0, + "step": 1651, + "time_per_iteration": 2.939943313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_mlp": 1.07648206, + "diversity_loss_mlp": 0.0, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.0785368607999539, + "language_loss": 0.86505926, + "learning_rate": 0.0007980607452784351, + "loss": 0.87595987, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1652, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.06952596, + "diversity_loss_mlp": 0.0, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.06920593361186494, + "language_loss": 0.90510356, + "learning_rate": 0.0007978105519081919, + "loss": 0.91593033, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1653, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084984, + "balance_loss_mlp": 1.0715965, + "diversity_loss_mlp": 0.0, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.07269169213621761, + "language_loss": 0.87967515, + "learning_rate": 0.0007975602429176385, + "loss": 0.89052504, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.13415527, + "routerloss_mlp": 0.0, + "step": 1654, + "time_per_iteration": 2.5818393230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_mlp": 1.07225442, + "diversity_loss_mlp": 0.0, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08150423110047789, + "language_loss": 0.81308222, + "learning_rate": 0.0007973098184039536, + "loss": 0.82394195, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1655, + "time_per_iteration": 2.664916515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094509, + "balance_loss_mlp": 1.08110952, + "diversity_loss_mlp": 0.0, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.0661968945841423, + "language_loss": 0.8695243, + "learning_rate": 0.0007970592784643602, + "loss": 0.88046944, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.13427734, + "routerloss_mlp": 0.0, + "step": 1656, + "time_per_iteration": 2.851214647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_mlp": 1.09084868, + "diversity_loss_mlp": 0.0, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.0809768283097012, + "language_loss": 0.85228848, + "learning_rate": 0.0007968086231961272, + "loss": 0.86333275, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1657, + "time_per_iteration": 2.6277201175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_mlp": 1.09744644, + "diversity_loss_mlp": 0.0, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.10999441213252201, + "language_loss": 0.83322126, + "learning_rate": 0.0007965578526965671, + "loss": 0.84433806, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1658, + "time_per_iteration": 2.5514447689056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097556, + "balance_loss_mlp": 1.08337009, + "diversity_loss_mlp": 0.0, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07090711515760839, + "language_loss": 0.86299932, + "learning_rate": 0.0007963069670630377, + "loss": 0.87397492, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1659, + "time_per_iteration": 2.722572088241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.07523549, + "diversity_loss_mlp": 0.0, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07181055202596492, + "language_loss": 0.88127738, + "learning_rate": 0.0007960559663929416, + "loss": 0.8921715, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1660, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_mlp": 1.06500006, + "diversity_loss_mlp": 0.0, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.06614466369263741, + "language_loss": 0.87915826, + "learning_rate": 0.0007958048507837259, + "loss": 0.88995141, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1661, + "time_per_iteration": 2.954888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06107187, + "diversity_loss_mlp": 0.0, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08599761261652404, + "language_loss": 0.87309289, + "learning_rate": 0.0007955536203328822, + "loss": 0.88384914, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1662, + "time_per_iteration": 2.9499282836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074811, + "balance_loss_mlp": 1.06073272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.08962386225204486, + "language_loss": 0.8334958, + "learning_rate": 0.0007953022751379469, + "loss": 0.84424388, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1663, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075722, + "balance_loss_mlp": 1.06131005, + "diversity_loss_mlp": 0.0, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.08182948291647181, + "language_loss": 0.8200748, + "learning_rate": 0.000795050815296501, + "loss": 0.830832, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1664, + "time_per_iteration": 2.9893014430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.07167196, + "diversity_loss_mlp": 0.0, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.0641722272838546, + "language_loss": 0.93037909, + "learning_rate": 0.0007947992409061695, + "loss": 0.94122881, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1665, + "time_per_iteration": 2.583789110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_mlp": 1.08662808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07388769827525307, + "language_loss": 0.86501724, + "learning_rate": 0.0007945475520646226, + "loss": 0.87601787, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1666, + "time_per_iteration": 2.944988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127031, + "balance_loss_mlp": 1.11408508, + "diversity_loss_mlp": 0.0, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.0781321549049884, + "language_loss": 0.84777099, + "learning_rate": 0.0007942957488695743, + "loss": 0.85904133, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1667, + "time_per_iteration": 2.667464017868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.12505507, + "diversity_loss_mlp": 0.0, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06588913292879497, + "language_loss": 0.81000018, + "learning_rate": 0.0007940438314187833, + "loss": 0.82138324, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.13250732, + "routerloss_mlp": 0.0, + "step": 1668, + "time_per_iteration": 3.0395359992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147791, + "balance_loss_mlp": 1.13491094, + "diversity_loss_mlp": 0.0, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.07621602089938284, + "language_loss": 0.80540276, + "learning_rate": 0.0007937917998100529, + "loss": 0.8168807, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1669, + "time_per_iteration": 2.5894687175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_mlp": 1.1294744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07981389159152626, + "language_loss": 0.79167509, + "learning_rate": 0.0007935396541412302, + "loss": 0.80310035, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.13067627, + "routerloss_mlp": 0.0, + "step": 1670, + "time_per_iteration": 2.672978401184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141245, + "balance_loss_mlp": 1.12813175, + "diversity_loss_mlp": 0.0, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.06899314705075654, + "language_loss": 0.85712755, + "learning_rate": 0.0007932873945102068, + "loss": 0.86854005, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1671, + "time_per_iteration": 2.6296515464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_mlp": 1.25616145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.05047573422440889, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.77033865, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1672, + "time_per_iteration": 4.840561628341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138556, + "balance_loss_mlp": 1.1251744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.06902528499394482, + "language_loss": 0.86527705, + "learning_rate": 0.0007927825337533461, + "loss": 0.87666261, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1673, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142697, + "balance_loss_mlp": 1.12930942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.08521571565711833, + "language_loss": 0.84877092, + "learning_rate": 0.0007925299328235131, + "loss": 0.8601979, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1674, + "time_per_iteration": 2.659621238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141943, + "balance_loss_mlp": 1.12855613, + "diversity_loss_mlp": 0.0, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.08187135533898351, + "language_loss": 0.84720862, + "learning_rate": 0.000792277218323488, + "loss": 0.85862803, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1675, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.12169456, + "diversity_loss_mlp": 0.0, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.08499328402904442, + "language_loss": 0.8509531, + "learning_rate": 0.0007920243903513833, + "loss": 0.86230332, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1676, + "time_per_iteration": 2.5730555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126699, + "balance_loss_mlp": 1.11364567, + "diversity_loss_mlp": 0.0, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08854342537284099, + "language_loss": 0.84008271, + "learning_rate": 0.0007917714490053556, + "loss": 0.85134971, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1677, + "time_per_iteration": 2.718555212020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_mlp": 1.10974979, + "diversity_loss_mlp": 0.0, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.07711595043056121, + "language_loss": 0.86223996, + "learning_rate": 0.0007915183943836055, + "loss": 0.87346947, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1678, + "time_per_iteration": 2.902038812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_mlp": 1.09958673, + "diversity_loss_mlp": 0.0, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07762427611918464, + "language_loss": 0.8422336, + "learning_rate": 0.0007912652265843773, + "loss": 0.85335761, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1679, + "time_per_iteration": 3.024665117263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107958, + "balance_loss_mlp": 1.09453535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06959311244041297, + "language_loss": 0.81845474, + "learning_rate": 0.0007910119457059597, + "loss": 0.82953429, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1680, + "time_per_iteration": 2.6954221725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111109, + "balance_loss_mlp": 1.09806776, + "diversity_loss_mlp": 0.0, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08135634404485692, + "language_loss": 0.80380678, + "learning_rate": 0.0007907585518466849, + "loss": 0.81491786, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1681, + "time_per_iteration": 2.961648464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_mlp": 1.09574652, + "diversity_loss_mlp": 0.0, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.06462126830885603, + "language_loss": 0.89670283, + "learning_rate": 0.000790505045104929, + "loss": 0.90779042, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1682, + "time_per_iteration": 2.5210485458374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_mlp": 1.09719789, + "diversity_loss_mlp": 0.0, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.08715930327910015, + "language_loss": 0.86719161, + "learning_rate": 0.0007902514255791125, + "loss": 0.8782934, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1683, + "time_per_iteration": 2.8002610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097901, + "balance_loss_mlp": 1.084764, + "diversity_loss_mlp": 0.0, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06656486310868524, + "language_loss": 0.8795855, + "learning_rate": 0.0007899976933676986, + "loss": 0.89056444, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1684, + "time_per_iteration": 2.967172622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092249, + "balance_loss_mlp": 1.07880259, + "diversity_loss_mlp": 0.0, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09628316614228749, + "language_loss": 0.87045735, + "learning_rate": 0.0007897438485691955, + "loss": 0.88137984, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1685, + "time_per_iteration": 2.680147171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103099, + "balance_loss_mlp": 1.0898304, + "diversity_loss_mlp": 0.0, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0850736326825917, + "language_loss": 0.82684374, + "learning_rate": 0.0007894898912821542, + "loss": 0.83787471, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1686, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.0880518, + "diversity_loss_mlp": 0.0, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06056792299191916, + "language_loss": 0.86695451, + "learning_rate": 0.0007892358216051695, + "loss": 0.87797034, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1687, + "time_per_iteration": 2.7851648330688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.09641767, + "diversity_loss_mlp": 0.0, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.07434076211008771, + "language_loss": 0.91829026, + "learning_rate": 0.0007889816396368803, + "loss": 0.92938912, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1688, + "time_per_iteration": 2.6211581230163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.10499799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07845440141588131, + "language_loss": 0.85253429, + "learning_rate": 0.0007887273454759687, + "loss": 0.8637172, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.13299561, + "routerloss_mlp": 0.0, + "step": 1689, + "time_per_iteration": 2.507779598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.10946417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.08373410695529686, + "language_loss": 0.82792354, + "learning_rate": 0.0007884729392211603, + "loss": 0.83914578, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1690, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119249, + "balance_loss_mlp": 1.10672641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09069843341009556, + "language_loss": 0.85648167, + "learning_rate": 0.0007882184209712245, + "loss": 0.86767411, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.12530518, + "routerloss_mlp": 0.0, + "step": 1691, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00949982, + "balance_loss_mlp": 1.66309059, + "diversity_loss_mlp": 0.20491584, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.028395749586794427, + "language_loss": 0.85757548, + "learning_rate": 0.000787963790824974, + "loss": 0.86707526, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597837, + "step": 1692, + "time_per_iteration": 3.009209156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113225, + "balance_loss_mlp": 1.10071397, + "diversity_loss_mlp": 0.0, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.22846677162281695, + "language_loss": 0.89612615, + "learning_rate": 0.0007877090488812651, + "loss": 0.90725839, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.12512207, + "routerloss_mlp": 0.0, + "step": 1693, + "time_per_iteration": 2.450209617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936753, + "balance_loss_mlp": 1.63723278, + "diversity_loss_mlp": 0.20419246, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.03161007726798549, + "language_loss": 0.83743423, + "learning_rate": 0.0007874541952389973, + "loss": 0.84680176, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01604037, + "step": 1694, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111615, + "balance_loss_mlp": 1.10350823, + "diversity_loss_mlp": 0.0, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.07424213060006848, + "language_loss": 0.86538494, + "learning_rate": 0.0007871992299971136, + "loss": 0.87654638, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1695, + "time_per_iteration": 2.570406913757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_mlp": 1.11953878, + "diversity_loss_mlp": 0.0, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.0612219868328418, + "language_loss": 0.84142137, + "learning_rate": 0.0007869441532546001, + "loss": 0.852741, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1696, + "time_per_iteration": 2.763688087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_mlp": 1.11626601, + "diversity_loss_mlp": 0.0, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06155756648422996, + "language_loss": 0.79298395, + "learning_rate": 0.0007866889651104867, + "loss": 0.80426925, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 1697, + "time_per_iteration": 2.816236972808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_mlp": 1.11769366, + "diversity_loss_mlp": 0.0, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0827611554210385, + "language_loss": 0.83172429, + "learning_rate": 0.000786433665663846, + "loss": 0.84303296, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.13195801, + "routerloss_mlp": 0.0, + "step": 1698, + "time_per_iteration": 2.6627049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135283, + "balance_loss_mlp": 1.12240815, + "diversity_loss_mlp": 0.0, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.08562611300573084, + "language_loss": 0.86256903, + "learning_rate": 0.0007861782550137942, + "loss": 0.87392187, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1699, + "time_per_iteration": 2.9298973083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115677, + "balance_loss_mlp": 1.10270739, + "diversity_loss_mlp": 0.0, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.06870341741306431, + "language_loss": 0.85913056, + "learning_rate": 0.0007859227332594901, + "loss": 0.8702873, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1700, + "time_per_iteration": 2.9108214378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_mlp": 1.08703494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.08010897822069696, + "language_loss": 0.84705722, + "learning_rate": 0.0007856671005001365, + "loss": 0.85805643, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1701, + "time_per_iteration": 3.172921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_mlp": 1.07506084, + "diversity_loss_mlp": 0.0, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.0963591610521261, + "language_loss": 0.81720912, + "learning_rate": 0.0007854113568349787, + "loss": 0.82809043, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.13085938, + "routerloss_mlp": 0.0, + "step": 1702, + "time_per_iteration": 3.1135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_mlp": 1.08686948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07838750037803571, + "language_loss": 0.80661154, + "learning_rate": 0.0007851555023633052, + "loss": 0.8176142, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.13397217, + "routerloss_mlp": 0.0, + "step": 1703, + "time_per_iteration": 2.841059684753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_mlp": 1.07271171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07047077484334266, + "language_loss": 0.82222247, + "learning_rate": 0.0007848995371844474, + "loss": 0.83308667, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1704, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.0816896, + "diversity_loss_mlp": 0.0, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08203255389116743, + "language_loss": 0.80260348, + "learning_rate": 0.0007846434613977801, + "loss": 0.81355333, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1705, + "time_per_iteration": 2.523026466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.0868392, + "diversity_loss_mlp": 0.0, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.07270926258732689, + "language_loss": 0.78603041, + "learning_rate": 0.0007843872751027203, + "loss": 0.7970314, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.13275146, + "routerloss_mlp": 0.0, + "step": 1706, + "time_per_iteration": 2.8923709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915397, + "balance_loss_mlp": 1.59612775, + "diversity_loss_mlp": 0.20258766, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.02966318853366187, + "language_loss": 0.87305748, + "learning_rate": 0.0007841309783987287, + "loss": 0.88221151, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01603885, + "step": 1707, + "time_per_iteration": 2.7517144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115655, + "balance_loss_mlp": 1.10263109, + "diversity_loss_mlp": 0.0, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06500174516261728, + "language_loss": 0.89240694, + "learning_rate": 0.0007838745713853084, + "loss": 0.9035635, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1708, + "time_per_iteration": 2.6181201934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_mlp": 1.10945296, + "diversity_loss_mlp": 0.0, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.06936064314807153, + "language_loss": 0.8434307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85465395, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.12866211, + "routerloss_mlp": 0.0, + "step": 1709, + "time_per_iteration": 2.7040350437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124287, + "balance_loss_mlp": 1.1112572, + "diversity_loss_mlp": 0.0, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06883588356672955, + "language_loss": 0.86454904, + "learning_rate": 0.0007833614268284082, + "loss": 0.87579191, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 1710, + "time_per_iteration": 2.5110740661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425821, + "balance_loss_mlp": 1.41738081, + "diversity_loss_mlp": 0.0, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.1402114647579648, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75535595, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.08447266, + "routerloss_mlp": 0.0, + "step": 1711, + "time_per_iteration": 4.873327016830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129416, + "balance_loss_mlp": 1.11650598, + "diversity_loss_mlp": 0.0, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0798208466882041, + "language_loss": 0.78414649, + "learning_rate": 0.0007828478422289016, + "loss": 0.79544067, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 1712, + "time_per_iteration": 2.608412027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138403, + "balance_loss_mlp": 1.12507582, + "diversity_loss_mlp": 0.0, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07544776571140048, + "language_loss": 0.8909815, + "learning_rate": 0.0007825908851623833, + "loss": 0.90236557, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.13323975, + "routerloss_mlp": 0.0, + "step": 1713, + "time_per_iteration": 2.8033607006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_mlp": 1.12190771, + "diversity_loss_mlp": 0.0, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.06974595077498419, + "language_loss": 0.85003847, + "learning_rate": 0.0007823338183843533, + "loss": 0.86138809, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1714, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148942, + "balance_loss_mlp": 1.13610959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.07049806127627434, + "language_loss": 0.81025606, + "learning_rate": 0.0007820766419946141, + "loss": 0.82174551, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1715, + "time_per_iteration": 3.3007164001464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168148, + "balance_loss_mlp": 1.16008925, + "diversity_loss_mlp": 0.0, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.052131774928428895, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80840629, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1716, + "time_per_iteration": 4.947760105133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906852, + "balance_loss_mlp": 1.58163857, + "diversity_loss_mlp": 0.20079982, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.033697214377685164, + "language_loss": 0.75853068, + "learning_rate": 0.0007815619607794288, + "loss": 0.76759923, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563331, + "step": 1717, + "time_per_iteration": 2.689937114715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173062, + "balance_loss_mlp": 1.1601274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.09689448967864323, + "language_loss": 0.8294118, + "learning_rate": 0.0007813044561538001, + "loss": 0.84114236, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1718, + "time_per_iteration": 3.1421005725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158875, + "balance_loss_mlp": 1.14559531, + "diversity_loss_mlp": 0.0, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06842928932014077, + "language_loss": 0.88578129, + "learning_rate": 0.0007810468423160958, + "loss": 0.89736998, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1719, + "time_per_iteration": 2.8917293548583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157511, + "balance_loss_mlp": 1.14486265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06941390463820386, + "language_loss": 0.81896281, + "learning_rate": 0.0007807891193663306, + "loss": 0.83053792, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 1720, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141979, + "balance_loss_mlp": 1.12950385, + "diversity_loss_mlp": 0.0, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07961809028947962, + "language_loss": 0.82409328, + "learning_rate": 0.0007805312874045614, + "loss": 0.83551311, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1721, + "time_per_iteration": 2.5056259632110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137994, + "balance_loss_mlp": 1.12510777, + "diversity_loss_mlp": 0.0, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.09061115976682882, + "language_loss": 0.86960506, + "learning_rate": 0.0007802733465308874, + "loss": 0.88098502, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1722, + "time_per_iteration": 2.438533306121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_mlp": 1.13225603, + "diversity_loss_mlp": 0.0, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06773749819611302, + "language_loss": 0.84162688, + "learning_rate": 0.0007800152968454501, + "loss": 0.8530758, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1723, + "time_per_iteration": 2.6364991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.12146711, + "diversity_loss_mlp": 0.0, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06044198445597461, + "language_loss": 0.90330362, + "learning_rate": 0.0007797571384484334, + "loss": 0.91464406, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.12567139, + "routerloss_mlp": 0.0, + "step": 1724, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133346, + "balance_loss_mlp": 1.12061453, + "diversity_loss_mlp": 0.0, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.0752969909322094, + "language_loss": 0.91929704, + "learning_rate": 0.0007794988714400633, + "loss": 0.93063056, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 1725, + "time_per_iteration": 2.615788698196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125798, + "balance_loss_mlp": 1.11242867, + "diversity_loss_mlp": 0.0, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.07890733478173245, + "language_loss": 0.85302055, + "learning_rate": 0.0007792404959206079, + "loss": 0.86427855, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.13372803, + "routerloss_mlp": 0.0, + "step": 1726, + "time_per_iteration": 2.545780897140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107165, + "balance_loss_mlp": 1.09446895, + "diversity_loss_mlp": 0.0, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07756389475354548, + "language_loss": 0.81480336, + "learning_rate": 0.0007789820119903774, + "loss": 0.82587504, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.12689209, + "routerloss_mlp": 0.0, + "step": 1727, + "time_per_iteration": 3.005662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114992, + "balance_loss_mlp": 1.10335684, + "diversity_loss_mlp": 0.0, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.03748312413261812, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.7960766, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 1728, + "time_per_iteration": 4.833205223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105872, + "balance_loss_mlp": 1.09285486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.07170574552345628, + "language_loss": 0.83970881, + "learning_rate": 0.0007784647192990428, + "loss": 0.85076749, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 1729, + "time_per_iteration": 2.7309772968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107979, + "balance_loss_mlp": 1.0948776, + "diversity_loss_mlp": 0.0, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06011930461286596, + "language_loss": 0.80777055, + "learning_rate": 0.0007782059107387696, + "loss": 0.81885028, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.13116455, + "routerloss_mlp": 0.0, + "step": 1730, + "time_per_iteration": 2.8615641593933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113195, + "balance_loss_mlp": 1.11733532, + "diversity_loss_mlp": 0.0, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08106060743083753, + "language_loss": 0.88617826, + "learning_rate": 0.0007779469941693826, + "loss": 0.89749771, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1731, + "time_per_iteration": 2.801208257675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126142, + "balance_loss_mlp": 1.11240935, + "diversity_loss_mlp": 0.0, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.09519717038034853, + "language_loss": 0.77091044, + "learning_rate": 0.0007776879696914029, + "loss": 0.78217185, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1732, + "time_per_iteration": 2.8286595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123068, + "balance_loss_mlp": 1.10889435, + "diversity_loss_mlp": 0.0, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.05947539267688924, + "language_loss": 0.88910627, + "learning_rate": 0.000777428837405392, + "loss": 0.90033698, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1733, + "time_per_iteration": 2.8319156169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121491, + "balance_loss_mlp": 1.10701954, + "diversity_loss_mlp": 0.0, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.07113995025739508, + "language_loss": 0.86735553, + "learning_rate": 0.0007771695974119544, + "loss": 0.87857044, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1734, + "time_per_iteration": 2.5376570224761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_mlp": 1.09795249, + "diversity_loss_mlp": 0.0, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.08734149249458338, + "language_loss": 0.75937277, + "learning_rate": 0.0007769102498117359, + "loss": 0.77049315, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1735, + "time_per_iteration": 3.093188524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105698, + "balance_loss_mlp": 1.09138131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06929562674350419, + "language_loss": 0.79383999, + "learning_rate": 0.000776650794705424, + "loss": 0.80489695, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1736, + "time_per_iteration": 3.253673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_mlp": 1.10730791, + "diversity_loss_mlp": 0.0, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.06325878214231093, + "language_loss": 0.82130396, + "learning_rate": 0.0007763912321937483, + "loss": 0.83252084, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1737, + "time_per_iteration": 2.7109947204589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117751, + "balance_loss_mlp": 1.10324299, + "diversity_loss_mlp": 0.0, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.08404595709863052, + "language_loss": 0.82403475, + "learning_rate": 0.0007761315623774799, + "loss": 0.83521223, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1738, + "time_per_iteration": 3.4125657081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109047, + "balance_loss_mlp": 1.0946703, + "diversity_loss_mlp": 0.0, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08421865543081901, + "language_loss": 0.87820536, + "learning_rate": 0.0007758717853574313, + "loss": 0.88929582, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1739, + "time_per_iteration": 2.7345223426818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106099, + "balance_loss_mlp": 1.09184134, + "diversity_loss_mlp": 0.0, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.07638673743764693, + "language_loss": 0.90095574, + "learning_rate": 0.0007756119012344571, + "loss": 0.91201669, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1740, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.08709717, + "diversity_loss_mlp": 0.0, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06863708242027233, + "language_loss": 0.8461023, + "learning_rate": 0.0007753519101094535, + "loss": 0.85711253, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1741, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089984, + "balance_loss_mlp": 1.07595301, + "diversity_loss_mlp": 0.0, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.07992644583812669, + "language_loss": 0.86363387, + "learning_rate": 0.0007750918120833575, + "loss": 0.87453371, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1742, + "time_per_iteration": 2.58940052986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.07488728, + "diversity_loss_mlp": 0.0, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.11201991585260462, + "language_loss": 0.87392128, + "learning_rate": 0.0007748316072571485, + "loss": 0.88480592, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1743, + "time_per_iteration": 2.8557286262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_mlp": 1.07202053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0749416267225997, + "language_loss": 0.79045737, + "learning_rate": 0.0007745712957318467, + "loss": 0.80131996, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1744, + "time_per_iteration": 2.9912548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_mlp": 1.07057166, + "diversity_loss_mlp": 0.0, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06946859722884112, + "language_loss": 0.86471289, + "learning_rate": 0.0007743108776085141, + "loss": 0.87555522, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1745, + "time_per_iteration": 2.7899224758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_mlp": 1.07023191, + "diversity_loss_mlp": 0.0, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08256839233284315, + "language_loss": 0.82965624, + "learning_rate": 0.0007740503529882543, + "loss": 0.84050083, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1746, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_mlp": 1.07044971, + "diversity_loss_mlp": 0.0, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.07349682427851349, + "language_loss": 0.90707254, + "learning_rate": 0.0007737897219722114, + "loss": 0.91791821, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1747, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_mlp": 1.07794499, + "diversity_loss_mlp": 0.0, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.05794758251669461, + "language_loss": 0.81094921, + "learning_rate": 0.0007735289846615716, + "loss": 0.82187206, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1748, + "time_per_iteration": 2.677976369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108166, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.0827866783592608, + "language_loss": 0.823035, + "learning_rate": 0.0007732681411575621, + "loss": 0.8341167, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1749, + "time_per_iteration": 2.674349069595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114062, + "balance_loss_mlp": 1.09997165, + "diversity_loss_mlp": 0.0, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.4203922337067485, + "language_loss": 0.87328398, + "learning_rate": 0.0007730071915614514, + "loss": 0.88442457, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1750, + "time_per_iteration": 2.6714634895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113648, + "balance_loss_mlp": 1.10037947, + "diversity_loss_mlp": 0.0, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09571011442330926, + "language_loss": 0.88792437, + "learning_rate": 0.0007727461359745489, + "loss": 0.89906085, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1751, + "time_per_iteration": 2.469905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141755, + "balance_loss_mlp": 1.12897623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.07412184794878955, + "language_loss": 0.85941112, + "learning_rate": 0.0007724849744982056, + "loss": 0.87082875, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 1752, + "time_per_iteration": 2.6805977821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117715, + "balance_loss_mlp": 1.16388226, + "diversity_loss_mlp": 0.0, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.09378397224837084, + "language_loss": 0.81843758, + "learning_rate": 0.0007722237072338131, + "loss": 0.83020908, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1753, + "time_per_iteration": 2.7348344326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186311, + "balance_loss_mlp": 1.17280459, + "diversity_loss_mlp": 0.0, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.1034159122014491, + "language_loss": 0.85304463, + "learning_rate": 0.0007719623342828046, + "loss": 0.86490774, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1754, + "time_per_iteration": 2.5181336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202577, + "balance_loss_mlp": 1.18872511, + "diversity_loss_mlp": 0.0, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.12703041648808322, + "language_loss": 0.84088987, + "learning_rate": 0.000771700855746654, + "loss": 0.85291564, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1755, + "time_per_iteration": 2.590925931930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188345, + "balance_loss_mlp": 1.1743381, + "diversity_loss_mlp": 0.0, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06849832931784437, + "language_loss": 0.88371092, + "learning_rate": 0.0007714392717268763, + "loss": 0.89559436, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1756, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_mlp": 1.17545295, + "diversity_loss_mlp": 0.0, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.09135673410225151, + "language_loss": 0.8630141, + "learning_rate": 0.0007711775823250273, + "loss": 0.8749072, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1757, + "time_per_iteration": 2.562939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194838, + "balance_loss_mlp": 1.18069935, + "diversity_loss_mlp": 0.0, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.07414503329772545, + "language_loss": 0.83081156, + "learning_rate": 0.0007709157876427039, + "loss": 0.84275991, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1758, + "time_per_iteration": 3.0652947425842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190916, + "balance_loss_mlp": 1.17681408, + "diversity_loss_mlp": 0.0, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.06977999371164574, + "language_loss": 0.85321373, + "learning_rate": 0.0007706538877815439, + "loss": 0.86512285, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1759, + "time_per_iteration": 2.5949320793151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202515, + "balance_loss_mlp": 1.1888063, + "diversity_loss_mlp": 0.0, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.052908737395413206, + "language_loss": 0.83029473, + "learning_rate": 0.0007703918828432259, + "loss": 0.84231991, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1760, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231589, + "balance_loss_mlp": 1.21696198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.11529749255982873, + "language_loss": 0.89274669, + "learning_rate": 0.000770129772929469, + "loss": 0.90506256, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1761, + "time_per_iteration": 2.6486427783966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212596, + "balance_loss_mlp": 1.19812357, + "diversity_loss_mlp": 0.0, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.10010821715075297, + "language_loss": 0.8820551, + "learning_rate": 0.0007698675581420334, + "loss": 0.89418107, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1762, + "time_per_iteration": 2.8473589420318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_mlp": 1.15610099, + "diversity_loss_mlp": 0.0, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06768336788468338, + "language_loss": 0.79040444, + "learning_rate": 0.0007696052385827199, + "loss": 0.80210984, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1763, + "time_per_iteration": 2.9893951416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147034, + "balance_loss_mlp": 1.13271689, + "diversity_loss_mlp": 0.0, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.06731413775333611, + "language_loss": 0.78161937, + "learning_rate": 0.00076934281435337, + "loss": 0.79308975, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1764, + "time_per_iteration": 2.7329161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933074, + "balance_loss_mlp": 1.62411106, + "diversity_loss_mlp": 0.20785357, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0341650984642099, + "language_loss": 0.86205357, + "learning_rate": 0.0007690802855558658, + "loss": 0.87138426, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170921, + "step": 1765, + "time_per_iteration": 2.9281163215637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121638, + "balance_loss_mlp": 1.10924029, + "diversity_loss_mlp": 0.0, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.029090002598214117, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77496594, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 1766, + "time_per_iteration": 4.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104609, + "balance_loss_mlp": 1.08886182, + "diversity_loss_mlp": 0.0, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.08396151855964885, + "language_loss": 0.89357018, + "learning_rate": 0.0007685549146641262, + "loss": 0.90461624, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1767, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_mlp": 1.093521, + "diversity_loss_mlp": 0.0, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.10736891621188589, + "language_loss": 0.8816734, + "learning_rate": 0.0007682920727738579, + "loss": 0.89275646, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1768, + "time_per_iteration": 2.5119268894195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_mlp": 1.08738232, + "diversity_loss_mlp": 0.0, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.10494960168224592, + "language_loss": 0.85048056, + "learning_rate": 0.000768029126723369, + "loss": 0.86150718, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1769, + "time_per_iteration": 2.495424270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090257, + "balance_loss_mlp": 1.07520068, + "diversity_loss_mlp": 0.0, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08686425564719477, + "language_loss": 0.82128584, + "learning_rate": 0.0007677660766147447, + "loss": 0.83218843, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 1770, + "time_per_iteration": 2.532904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_mlp": 1.05578792, + "diversity_loss_mlp": 0.0, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.023964921008177247, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73537892, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 1771, + "time_per_iteration": 4.944117784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.1034112, + "diversity_loss_mlp": 0.0, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.10616133846526872, + "language_loss": 0.795196, + "learning_rate": 0.0007672396646316306, + "loss": 0.80637527, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1772, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134399, + "balance_loss_mlp": 1.11959314, + "diversity_loss_mlp": 0.0, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.07513330183645242, + "language_loss": 0.80376065, + "learning_rate": 0.000766976302961512, + "loss": 0.8151046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1773, + "time_per_iteration": 3.042421340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158934, + "balance_loss_mlp": 1.14410484, + "diversity_loss_mlp": 0.0, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.07872996810077096, + "language_loss": 0.81390858, + "learning_rate": 0.0007667128376420003, + "loss": 0.82549793, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1774, + "time_per_iteration": 2.536562442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208475, + "balance_loss_mlp": 1.19358635, + "diversity_loss_mlp": 0.0, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.08297883362487203, + "language_loss": 0.8462863, + "learning_rate": 0.0007664492687753817, + "loss": 0.85837102, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1775, + "time_per_iteration": 2.6977102756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198612, + "balance_loss_mlp": 1.18424678, + "diversity_loss_mlp": 0.0, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10155126624771216, + "language_loss": 0.81542516, + "learning_rate": 0.000766185596463983, + "loss": 0.82741123, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1776, + "time_per_iteration": 2.6038215160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196202, + "balance_loss_mlp": 1.18163514, + "diversity_loss_mlp": 0.0, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.0897891274607312, + "language_loss": 0.77011722, + "learning_rate": 0.0007659218208101706, + "loss": 0.78207922, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1777, + "time_per_iteration": 3.0933022499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173425, + "balance_loss_mlp": 1.15902483, + "diversity_loss_mlp": 0.0, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.08364054831663822, + "language_loss": 0.85122472, + "learning_rate": 0.0007656579419163515, + "loss": 0.86295897, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1778, + "time_per_iteration": 2.732297420501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146549, + "balance_loss_mlp": 1.13211274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.0722191895240348, + "language_loss": 0.77409559, + "learning_rate": 0.0007653939598849724, + "loss": 0.78556108, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1779, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_mlp": 1.02253902, + "diversity_loss_mlp": 0.0, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.029240552967656448, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83912855, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 1780, + "time_per_iteration": 4.9182775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10688317, + "diversity_loss_mlp": 0.0, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07624931845389674, + "language_loss": 0.80176342, + "learning_rate": 0.000764865686819522, + "loss": 0.81297386, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1781, + "time_per_iteration": 3.0602052211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111853, + "balance_loss_mlp": 1.097965, + "diversity_loss_mlp": 0.0, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.07936344533488468, + "language_loss": 0.85836053, + "learning_rate": 0.0007646013959905449, + "loss": 0.86947906, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1782, + "time_per_iteration": 2.5750925540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_mlp": 1.09528995, + "diversity_loss_mlp": 0.0, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.07233814650781724, + "language_loss": 0.81042612, + "learning_rate": 0.0007643370024341949, + "loss": 0.82151681, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1783, + "time_per_iteration": 3.0870087146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09431553, + "diversity_loss_mlp": 0.0, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.07806584209391611, + "language_loss": 0.83175099, + "learning_rate": 0.0007640725062531195, + "loss": 0.84283221, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1784, + "time_per_iteration": 2.5063886642456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_mlp": 1.08888865, + "diversity_loss_mlp": 0.0, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.5067557182324087, + "language_loss": 0.86699629, + "learning_rate": 0.0007638079075500047, + "loss": 0.87802398, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1785, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015111, + "balance_loss_mlp": 1.00562215, + "diversity_loss_mlp": 0.0, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.016449027395748255, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76195776, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 1786, + "time_per_iteration": 4.944318056106567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.13542247, + "diversity_loss_mlp": 0.0, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.07356798682381475, + "language_loss": 0.83088338, + "learning_rate": 0.0007632784029886026, + "loss": 0.84238386, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1787, + "time_per_iteration": 2.6217002868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204344, + "balance_loss_mlp": 1.1884768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.08799574205003287, + "language_loss": 0.85466659, + "learning_rate": 0.0007630134973358873, + "loss": 0.86671007, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1788, + "time_per_iteration": 2.9664394855499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_mlp": 1.2359066, + "diversity_loss_mlp": 0.0, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.1052875761358054, + "language_loss": 0.86575854, + "learning_rate": 0.0007627484895722763, + "loss": 0.87827688, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1789, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247407, + "balance_loss_mlp": 1.23117065, + "diversity_loss_mlp": 0.0, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.09611070791328494, + "language_loss": 0.80025196, + "learning_rate": 0.0007624833798006552, + "loss": 0.81272602, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.16235352, + "routerloss_mlp": 0.0, + "step": 1790, + "time_per_iteration": 3.046809196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238128, + "balance_loss_mlp": 1.22221315, + "diversity_loss_mlp": 0.0, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.07959093752215074, + "language_loss": 0.83783114, + "learning_rate": 0.0007622181681239483, + "loss": 0.8502124, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1791, + "time_per_iteration": 2.6601433753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_mlp": 1.22793913, + "diversity_loss_mlp": 0.0, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07919089267187412, + "language_loss": 0.84668601, + "learning_rate": 0.0007619528546451202, + "loss": 0.85912943, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 1792, + "time_per_iteration": 2.782947063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208587, + "balance_loss_mlp": 1.19314909, + "diversity_loss_mlp": 0.0, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.07332959959795217, + "language_loss": 0.83832949, + "learning_rate": 0.0007616874394671745, + "loss": 0.85041535, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1793, + "time_per_iteration": 3.3206703662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.169258, + "diversity_loss_mlp": 0.0, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.0713753042238581, + "language_loss": 0.85051751, + "learning_rate": 0.0007614219226931547, + "loss": 0.86236751, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1794, + "time_per_iteration": 2.7190396785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179587, + "balance_loss_mlp": 1.16401851, + "diversity_loss_mlp": 0.0, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.07163818055438703, + "language_loss": 0.8457973, + "learning_rate": 0.0007611563044261435, + "loss": 0.85759324, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 1795, + "time_per_iteration": 2.5077741146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150042, + "balance_loss_mlp": 1.13422251, + "diversity_loss_mlp": 0.0, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0670543853763616, + "language_loss": 0.86376798, + "learning_rate": 0.0007608905847692631, + "loss": 0.8752684, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1796, + "time_per_iteration": 2.4662768840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112741, + "balance_loss_mlp": 1.11171043, + "diversity_loss_mlp": 0.0, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07671810253227593, + "language_loss": 0.86553091, + "learning_rate": 0.0007606247638256749, + "loss": 0.87680501, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 1797, + "time_per_iteration": 2.8649494647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00624206, + "balance_loss_mlp": 1.05204535, + "diversity_loss_mlp": 0.16984753, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0016633519833830733, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.78794497, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01325956, + "step": 1798, + "time_per_iteration": 4.963132619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_mlp": 1.04498482, + "diversity_loss_mlp": 0.0, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.032920799461559694, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80382872, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 1799, + "time_per_iteration": 4.773633003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_mlp": 1.08345306, + "diversity_loss_mlp": 0.0, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.10233507255995049, + "language_loss": 0.85892332, + "learning_rate": 0.0007598266943068686, + "loss": 0.86992049, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 1800, + "time_per_iteration": 2.7380948066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092311, + "balance_loss_mlp": 1.0761466, + "diversity_loss_mlp": 0.0, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.08416075255699706, + "language_loss": 0.83903629, + "learning_rate": 0.0007595604692488507, + "loss": 0.84995937, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1801, + "time_per_iteration": 2.5558300018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099836, + "balance_loss_mlp": 1.08382583, + "diversity_loss_mlp": 0.0, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.0681721192963598, + "language_loss": 0.82674247, + "learning_rate": 0.0007592941434205215, + "loss": 0.83774084, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 1802, + "time_per_iteration": 2.8181002140045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017477, + "balance_loss_mlp": 1.00651026, + "diversity_loss_mlp": 0.0, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.018274165575771096, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588537, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 1803, + "time_per_iteration": 5.063629388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126513, + "balance_loss_mlp": 1.11121821, + "diversity_loss_mlp": 0.0, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07342722091818694, + "language_loss": 0.80217302, + "learning_rate": 0.0007587611898665566, + "loss": 0.81343818, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.15270996, + "routerloss_mlp": 0.0, + "step": 1804, + "time_per_iteration": 3.0994317531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113982, + "balance_loss_mlp": 1.12468028, + "diversity_loss_mlp": 0.0, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.05936466476556785, + "language_loss": 0.82130265, + "learning_rate": 0.0007584945623478315, + "loss": 0.83270085, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1805, + "time_per_iteration": 2.833981513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152624, + "balance_loss_mlp": 1.13780582, + "diversity_loss_mlp": 0.0, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.08744691316973383, + "language_loss": 0.80801159, + "learning_rate": 0.000758227834472617, + "loss": 0.81953788, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1806, + "time_per_iteration": 3.0535178184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166216, + "balance_loss_mlp": 1.15111172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07500761638021176, + "language_loss": 0.77729452, + "learning_rate": 0.0007579610063444664, + "loss": 0.7889567, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1807, + "time_per_iteration": 2.7615864276885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149194, + "balance_loss_mlp": 1.1339947, + "diversity_loss_mlp": 0.0, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.07406875426876382, + "language_loss": 0.87547183, + "learning_rate": 0.0007576940780669712, + "loss": 0.88696373, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1808, + "time_per_iteration": 3.264080762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143539, + "balance_loss_mlp": 1.12863731, + "diversity_loss_mlp": 0.0, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07928472428244501, + "language_loss": 0.84104979, + "learning_rate": 0.0007574270497437624, + "loss": 0.85248518, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1809, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128376, + "balance_loss_mlp": 1.11302221, + "diversity_loss_mlp": 0.0, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.07150597602774303, + "language_loss": 0.88426095, + "learning_rate": 0.000757159921478509, + "loss": 0.89554477, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 1810, + "time_per_iteration": 2.7891488075256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057512, + "balance_loss_mlp": 1.04754615, + "diversity_loss_mlp": 0.0, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.03228641235871289, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75508153, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 1811, + "time_per_iteration": 4.737962007522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103488, + "balance_loss_mlp": 1.08814573, + "diversity_loss_mlp": 0.0, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.07438083858778873, + "language_loss": 0.87798911, + "learning_rate": 0.0007566253655367423, + "loss": 0.88902402, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 1812, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.07600367, + "diversity_loss_mlp": 0.0, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.06854488097647142, + "language_loss": 0.8957805, + "learning_rate": 0.000756357938067762, + "loss": 0.90669596, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 1813, + "time_per_iteration": 2.7090489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.07826209, + "diversity_loss_mlp": 0.0, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.0690606019510397, + "language_loss": 0.8334865, + "learning_rate": 0.0007560904110718033, + "loss": 0.84443069, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1814, + "time_per_iteration": 3.2445590496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096093, + "balance_loss_mlp": 1.08003569, + "diversity_loss_mlp": 0.0, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06223934742271703, + "language_loss": 0.83650601, + "learning_rate": 0.0007558227846527297, + "loss": 0.84746695, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1815, + "time_per_iteration": 2.8504550457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_mlp": 1.08731842, + "diversity_loss_mlp": 0.0, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.07831164241761415, + "language_loss": 0.83117825, + "learning_rate": 0.0007555550589144429, + "loss": 0.84221166, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1816, + "time_per_iteration": 2.4655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_mlp": 1.09515882, + "diversity_loss_mlp": 0.0, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.08460625336983617, + "language_loss": 0.84522688, + "learning_rate": 0.000755287233960883, + "loss": 0.85633731, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1817, + "time_per_iteration": 2.602492094039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.07385683, + "diversity_loss_mlp": 0.0, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.07045705340523431, + "language_loss": 0.77682364, + "learning_rate": 0.0007550193098960292, + "loss": 0.78771949, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1818, + "time_per_iteration": 2.8674800395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00989642, + "balance_loss_mlp": 1.73270237, + "diversity_loss_mlp": 0.21087486, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.029406524514427698, + "language_loss": 0.86412024, + "learning_rate": 0.0007547512868238988, + "loss": 0.87401664, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01785346, + "step": 1819, + "time_per_iteration": 3.151559829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_mlp": 1.07453036, + "diversity_loss_mlp": 0.0, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.06124546921927801, + "language_loss": 0.83503008, + "learning_rate": 0.0007544831648485473, + "loss": 0.84593564, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1820, + "time_per_iteration": 2.6791367530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094234, + "balance_loss_mlp": 1.07806909, + "diversity_loss_mlp": 0.0, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.08232155140582742, + "language_loss": 0.81448233, + "learning_rate": 0.0007542149440740694, + "loss": 0.82542467, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1821, + "time_per_iteration": 2.665632724761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.07229352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.08177047744866778, + "language_loss": 0.85514361, + "learning_rate": 0.000753946624604597, + "loss": 0.8660273, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1822, + "time_per_iteration": 2.708221673965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085159, + "balance_loss_mlp": 1.06938744, + "diversity_loss_mlp": 0.0, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.07022994660183399, + "language_loss": 0.88119262, + "learning_rate": 0.0007536782065443015, + "loss": 0.89204431, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 1823, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109141, + "balance_loss_mlp": 1.0758059, + "diversity_loss_mlp": 0.0, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.09965750131036237, + "language_loss": 0.75038946, + "learning_rate": 0.0007534096899973919, + "loss": 0.7613036, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1824, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089888, + "balance_loss_mlp": 1.07460535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.0636070515998131, + "language_loss": 0.82941401, + "learning_rate": 0.0007531410750681154, + "loss": 0.84031284, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1825, + "time_per_iteration": 2.7595911026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_mlp": 1.08562207, + "diversity_loss_mlp": 0.0, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.09267960960885083, + "language_loss": 0.87015611, + "learning_rate": 0.0007528723618607575, + "loss": 0.88115728, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1826, + "time_per_iteration": 3.4216692447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090335, + "balance_loss_mlp": 1.07524323, + "diversity_loss_mlp": 0.0, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.07214965975453298, + "language_loss": 0.82582879, + "learning_rate": 0.0007526035504796422, + "loss": 0.83673215, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.15087891, + "routerloss_mlp": 0.0, + "step": 1827, + "time_per_iteration": 2.7822000980377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094099, + "balance_loss_mlp": 1.0794003, + "diversity_loss_mlp": 0.0, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.07057247929289283, + "language_loss": 0.86824054, + "learning_rate": 0.0007523346410291312, + "loss": 0.8791815, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1828, + "time_per_iteration": 2.7560181617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098232, + "balance_loss_mlp": 1.08291376, + "diversity_loss_mlp": 0.0, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.0630617970486185, + "language_loss": 0.85159689, + "learning_rate": 0.0007520656336136245, + "loss": 0.86257917, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1829, + "time_per_iteration": 2.9432313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098974, + "balance_loss_mlp": 1.08431172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06541232162591855, + "language_loss": 0.88230217, + "learning_rate": 0.0007517965283375599, + "loss": 0.89329195, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1830, + "time_per_iteration": 2.8773486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_mlp": 1.08363926, + "diversity_loss_mlp": 0.0, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.06973135687475002, + "language_loss": 0.89511967, + "learning_rate": 0.0007515273253054132, + "loss": 0.90610522, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1831, + "time_per_iteration": 2.662757396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097986, + "balance_loss_mlp": 1.08288169, + "diversity_loss_mlp": 0.0, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.07142201858296882, + "language_loss": 0.82785273, + "learning_rate": 0.0007512580246216988, + "loss": 0.83883256, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1832, + "time_per_iteration": 2.730994939804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.08164394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.07119734441282773, + "language_loss": 0.84715027, + "learning_rate": 0.000750988626390968, + "loss": 0.85811406, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1833, + "time_per_iteration": 2.604182004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.07508624, + "diversity_loss_mlp": 0.0, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.07060575001723658, + "language_loss": 0.85089648, + "learning_rate": 0.0007507191307178108, + "loss": 0.86179501, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1834, + "time_per_iteration": 2.7584774494171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_mlp": 1.06808281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.09392412586459238, + "language_loss": 0.75105453, + "learning_rate": 0.0007504495377068543, + "loss": 0.76188982, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1835, + "time_per_iteration": 2.731039524078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087025, + "balance_loss_mlp": 1.07230306, + "diversity_loss_mlp": 0.0, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09299008065025831, + "language_loss": 0.81784093, + "learning_rate": 0.0007501798474627642, + "loss": 0.82871115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1836, + "time_per_iteration": 2.9180665016174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092183, + "balance_loss_mlp": 1.07738876, + "diversity_loss_mlp": 0.0, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.06800399913452355, + "language_loss": 0.8354817, + "learning_rate": 0.0007499100600902433, + "loss": 0.84640354, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1837, + "time_per_iteration": 2.981478452682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097597, + "balance_loss_mlp": 1.08236217, + "diversity_loss_mlp": 0.0, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.07178124654929893, + "language_loss": 0.83625698, + "learning_rate": 0.0007496401756940324, + "loss": 0.84723294, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1838, + "time_per_iteration": 2.7256877422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107267, + "balance_loss_mlp": 1.09267545, + "diversity_loss_mlp": 0.0, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.08438072522416575, + "language_loss": 0.81940264, + "learning_rate": 0.0007493701943789098, + "loss": 0.83047533, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1839, + "time_per_iteration": 2.805553674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_mlp": 1.10266685, + "diversity_loss_mlp": 0.0, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07000666511795951, + "language_loss": 0.82830888, + "learning_rate": 0.000749100116249692, + "loss": 0.83948612, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1840, + "time_per_iteration": 2.608135223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00954188, + "balance_loss_mlp": 1.66862321, + "diversity_loss_mlp": 0.20571998, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.03743173710930313, + "language_loss": 0.86076337, + "learning_rate": 0.0007488299414112321, + "loss": 0.87030524, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01701665, + "step": 1841, + "time_per_iteration": 2.6307811737060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_mlp": 1.10974133, + "diversity_loss_mlp": 0.0, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.06710116446149988, + "language_loss": 0.77204335, + "learning_rate": 0.0007485596699684215, + "loss": 0.78328466, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1842, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132611, + "balance_loss_mlp": 1.11780548, + "diversity_loss_mlp": 0.0, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.07987851383877129, + "language_loss": 0.85353696, + "learning_rate": 0.000748289302026189, + "loss": 0.86486304, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1843, + "time_per_iteration": 2.8449106216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_mlp": 1.11339569, + "diversity_loss_mlp": 0.0, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.06918658934745357, + "language_loss": 0.85752398, + "learning_rate": 0.0007480188376895004, + "loss": 0.86880362, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1844, + "time_per_iteration": 3.0339298248291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160602, + "balance_loss_mlp": 1.15135121, + "diversity_loss_mlp": 0.0, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.06421168097867443, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74971944, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 1845, + "time_per_iteration": 4.932978391647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.10506296, + "diversity_loss_mlp": 0.0, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08194467088107492, + "language_loss": 0.78768218, + "learning_rate": 0.0007474776202528074, + "loss": 0.79887938, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1846, + "time_per_iteration": 2.9188990592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111713, + "balance_loss_mlp": 1.1021452, + "diversity_loss_mlp": 0.0, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08015412782248336, + "language_loss": 0.80999184, + "learning_rate": 0.000747206867362922, + "loss": 0.82116312, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1847, + "time_per_iteration": 3.0966272354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_mlp": 1.085235, + "diversity_loss_mlp": 0.0, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.09857033029565816, + "language_loss": 0.836568, + "learning_rate": 0.0007469360184988194, + "loss": 0.84756613, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1848, + "time_per_iteration": 2.9021246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_mlp": 1.08986914, + "diversity_loss_mlp": 0.0, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08185517170087683, + "language_loss": 0.86821651, + "learning_rate": 0.0007466650737656518, + "loss": 0.8792634, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1849, + "time_per_iteration": 2.615549325942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_mlp": 1.0876888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06916390030254578, + "language_loss": 0.89687926, + "learning_rate": 0.0007463940332686098, + "loss": 0.9078998, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1850, + "time_per_iteration": 2.497159242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931214, + "balance_loss_mlp": 1.62144685, + "diversity_loss_mlp": 0.20650919, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.030410176313075864, + "language_loss": 0.84120536, + "learning_rate": 0.0007461228971129205, + "loss": 0.85051751, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01723633, + "step": 1851, + "time_per_iteration": 2.959170341491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931448, + "balance_loss_mlp": 1.62270963, + "diversity_loss_mlp": 0.20620242, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.03221270440610224, + "language_loss": 0.85523784, + "learning_rate": 0.0007458516654038483, + "loss": 0.86455238, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01699215, + "step": 1852, + "time_per_iteration": 2.6886868476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.13526964, + "diversity_loss_mlp": 0.0, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06572834298852859, + "language_loss": 0.86835778, + "learning_rate": 0.0007455803382466946, + "loss": 0.8798511, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1853, + "time_per_iteration": 2.8323659896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151398, + "balance_loss_mlp": 1.13686657, + "diversity_loss_mlp": 0.0, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.06349489422764842, + "language_loss": 0.86956179, + "learning_rate": 0.0007453089157467979, + "loss": 0.88107574, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1854, + "time_per_iteration": 2.817117929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.13687038, + "diversity_loss_mlp": 0.0, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06687597930641362, + "language_loss": 0.8221277, + "learning_rate": 0.0007450373980095341, + "loss": 0.83364242, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1855, + "time_per_iteration": 3.0857772827148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_mlp": 1.13494754, + "diversity_loss_mlp": 0.0, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.0656889709190827, + "language_loss": 0.86804116, + "learning_rate": 0.0007447657851403155, + "loss": 0.87952584, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1856, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144273, + "balance_loss_mlp": 1.1303966, + "diversity_loss_mlp": 0.0, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.08894932465162153, + "language_loss": 0.78988904, + "learning_rate": 0.0007444940772445915, + "loss": 0.80133176, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1857, + "time_per_iteration": 2.752232551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122576, + "balance_loss_mlp": 1.10860419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06705763345081875, + "language_loss": 0.80129987, + "learning_rate": 0.0007442222744278484, + "loss": 0.81252563, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1858, + "time_per_iteration": 2.638322591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_mlp": 1.09717393, + "diversity_loss_mlp": 0.0, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.05935371072747042, + "language_loss": 0.8399322, + "learning_rate": 0.0007439503767956099, + "loss": 0.85104102, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.137146, + "routerloss_mlp": 0.0, + "step": 1859, + "time_per_iteration": 2.699204921722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124434, + "balance_loss_mlp": 1.11480188, + "diversity_loss_mlp": 0.0, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.03541879327423246, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80796039, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 1860, + "time_per_iteration": 4.89499831199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089258, + "balance_loss_mlp": 1.07479787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.06413043417122823, + "language_loss": 0.86215138, + "learning_rate": 0.000743406297506922, + "loss": 0.87304389, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1861, + "time_per_iteration": 2.7184388637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919817, + "balance_loss_mlp": 1.60078692, + "diversity_loss_mlp": 0.20507258, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.028510278569739433, + "language_loss": 0.84439111, + "learning_rate": 0.0007431341160617031, + "loss": 0.8535893, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01688758, + "step": 1862, + "time_per_iteration": 2.8915610313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_mlp": 1.06988358, + "diversity_loss_mlp": 0.0, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06954606141633879, + "language_loss": 0.88100171, + "learning_rate": 0.0007428618402234491, + "loss": 0.8918457, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1863, + "time_per_iteration": 2.6724555492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087919, + "balance_loss_mlp": 1.0733279, + "diversity_loss_mlp": 0.0, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.07542508091229044, + "language_loss": 0.80288851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81376767, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1864, + "time_per_iteration": 2.724853038787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.06996608, + "diversity_loss_mlp": 0.0, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07695346444963648, + "language_loss": 0.7981261, + "learning_rate": 0.0007423170057906996, + "loss": 0.80896473, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1865, + "time_per_iteration": 3.9006779193878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108627, + "balance_loss_mlp": 1.0722512, + "diversity_loss_mlp": 0.0, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.07814080760266444, + "language_loss": 0.86228722, + "learning_rate": 0.0007420444474077275, + "loss": 0.87314993, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1866, + "time_per_iteration": 2.546194076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095193, + "balance_loss_mlp": 1.0812335, + "diversity_loss_mlp": 0.0, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.0773553058948038, + "language_loss": 0.8949936, + "learning_rate": 0.0007417717950547671, + "loss": 0.90594554, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1867, + "time_per_iteration": 2.5670700073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_mlp": 1.04262233, + "diversity_loss_mlp": 0.0, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.023944930622272237, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.770491, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 1868, + "time_per_iteration": 4.900780200958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_mlp": 1.087533, + "diversity_loss_mlp": 0.0, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.06547244306940128, + "language_loss": 0.84938717, + "learning_rate": 0.0007412262088623299, + "loss": 0.86040014, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1869, + "time_per_iteration": 2.7674195766448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092029, + "balance_loss_mlp": 1.60128522, + "diversity_loss_mlp": 0.20662443, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.03542659619783611, + "language_loss": 0.79155517, + "learning_rate": 0.0007409532752346684, + "loss": 0.80075806, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633519, + "step": 1870, + "time_per_iteration": 2.7116785049438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111485, + "balance_loss_mlp": 1.101367, + "diversity_loss_mlp": 0.0, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.061502004439029076, + "language_loss": 0.8836326, + "learning_rate": 0.0007406802480606491, + "loss": 0.89478111, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1871, + "time_per_iteration": 2.642608165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105605, + "balance_loss_mlp": 1.0916698, + "diversity_loss_mlp": 0.0, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.06939665757215846, + "language_loss": 0.90353388, + "learning_rate": 0.0007404071274462707, + "loss": 0.91458994, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.1394043, + "routerloss_mlp": 0.0, + "step": 1872, + "time_per_iteration": 2.5600955486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09967744, + "diversity_loss_mlp": 0.0, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.07241097832053987, + "language_loss": 0.83719409, + "learning_rate": 0.0007401339134975682, + "loss": 0.84832925, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1873, + "time_per_iteration": 2.6775293350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111162, + "balance_loss_mlp": 1.09724998, + "diversity_loss_mlp": 0.0, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.07980684605652169, + "language_loss": 0.84604299, + "learning_rate": 0.0007398606063206122, + "loss": 0.85715467, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1874, + "time_per_iteration": 2.6092889308929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109131, + "balance_loss_mlp": 1.09546924, + "diversity_loss_mlp": 0.0, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09304103013369584, + "language_loss": 0.78818524, + "learning_rate": 0.0007395872060215101, + "loss": 0.79927647, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1875, + "time_per_iteration": 2.5999374389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124779, + "balance_loss_mlp": 1.11121297, + "diversity_loss_mlp": 0.0, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08049441369365674, + "language_loss": 0.8851527, + "learning_rate": 0.0007393137127064056, + "loss": 0.89640045, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1876, + "time_per_iteration": 2.635896682739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_mlp": 1.11380959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.06613177233605298, + "language_loss": 0.84377646, + "learning_rate": 0.0007390401264814779, + "loss": 0.8550508, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1877, + "time_per_iteration": 2.597508192062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151319, + "balance_loss_mlp": 1.1378243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.09083655630754779, + "language_loss": 0.84454513, + "learning_rate": 0.0007387664474529427, + "loss": 0.8560583, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1878, + "time_per_iteration": 2.6493661403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143725, + "balance_loss_mlp": 1.1302073, + "diversity_loss_mlp": 0.0, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.0643860955644754, + "language_loss": 0.91379291, + "learning_rate": 0.0007384926757270518, + "loss": 0.92523015, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1879, + "time_per_iteration": 2.62565016746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152012, + "balance_loss_mlp": 1.13819528, + "diversity_loss_mlp": 0.0, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07609143241795291, + "language_loss": 0.80057949, + "learning_rate": 0.0007382188114100924, + "loss": 0.81209958, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1880, + "time_per_iteration": 2.974212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.14148784, + "diversity_loss_mlp": 0.0, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.0632350243804942, + "language_loss": 0.8182314, + "learning_rate": 0.0007379448546083884, + "loss": 0.82978803, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1881, + "time_per_iteration": 2.894099712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154364, + "balance_loss_mlp": 1.14052355, + "diversity_loss_mlp": 0.0, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06232367753538678, + "language_loss": 0.8822301, + "learning_rate": 0.0007376708054282992, + "loss": 0.89377379, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1882, + "time_per_iteration": 2.9576163291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162916, + "balance_loss_mlp": 1.14919519, + "diversity_loss_mlp": 0.0, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06608098206448941, + "language_loss": 0.83563071, + "learning_rate": 0.0007373966639762201, + "loss": 0.84725988, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1883, + "time_per_iteration": 2.6004068851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158732, + "balance_loss_mlp": 1.14478457, + "diversity_loss_mlp": 0.0, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.07441448138889938, + "language_loss": 0.88544619, + "learning_rate": 0.0007371224303585822, + "loss": 0.89703357, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1884, + "time_per_iteration": 2.5741078853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109461, + "balance_loss_mlp": 1.09897089, + "diversity_loss_mlp": 0.0, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.03545085729862102, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81466532, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 1885, + "time_per_iteration": 4.706872224807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.13442218, + "diversity_loss_mlp": 0.0, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.0691831634947964, + "language_loss": 0.8278423, + "learning_rate": 0.0007365736870525335, + "loss": 0.83932269, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1886, + "time_per_iteration": 2.8480284214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.12236464, + "diversity_loss_mlp": 0.0, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.0786816251155578, + "language_loss": 0.82659888, + "learning_rate": 0.000736299177577164, + "loss": 0.83795714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1887, + "time_per_iteration": 2.601449966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127197, + "balance_loss_mlp": 1.11358309, + "diversity_loss_mlp": 0.0, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0767010159800114, + "language_loss": 0.8381778, + "learning_rate": 0.0007360245763623174, + "loss": 0.84944975, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1888, + "time_per_iteration": 2.6951138973236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_mlp": 1.09350717, + "diversity_loss_mlp": 0.0, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06311908909694558, + "language_loss": 0.89886129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90992391, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1889, + "time_per_iteration": 2.8509137630462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094399, + "balance_loss_mlp": 1.08141732, + "diversity_loss_mlp": 0.0, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.06820711534899371, + "language_loss": 0.86674547, + "learning_rate": 0.0007354750991406684, + "loss": 0.87768942, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1890, + "time_per_iteration": 2.7162795066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_mlp": 1.07673419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07876014589837055, + "language_loss": 0.80930853, + "learning_rate": 0.0007352002233471919, + "loss": 0.82020569, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1891, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091374, + "balance_loss_mlp": 1.07835662, + "diversity_loss_mlp": 0.0, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.08103720744805817, + "language_loss": 0.79372823, + "learning_rate": 0.0007349252562408906, + "loss": 0.80464196, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1892, + "time_per_iteration": 2.6752734184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097706, + "balance_loss_mlp": 1.08496833, + "diversity_loss_mlp": 0.0, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.07356128462514616, + "language_loss": 0.81490725, + "learning_rate": 0.0007346501979285158, + "loss": 0.82588428, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1893, + "time_per_iteration": 2.8990893363952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_mlp": 1.03214884, + "diversity_loss_mlp": 0.0, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.022756463517582398, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81579787, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 1894, + "time_per_iteration": 4.8097145557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098768, + "balance_loss_mlp": 1.0857501, + "diversity_loss_mlp": 0.0, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06969655176236832, + "language_loss": 0.85880721, + "learning_rate": 0.0007340998081127308, + "loss": 0.86979485, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1895, + "time_per_iteration": 2.757380485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087705, + "balance_loss_mlp": 1.074646, + "diversity_loss_mlp": 0.0, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06910669114263218, + "language_loss": 0.91127002, + "learning_rate": 0.0007338244768230007, + "loss": 0.92214715, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1896, + "time_per_iteration": 2.7967634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098629, + "balance_loss_mlp": 1.08584976, + "diversity_loss_mlp": 0.0, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.05804787602656793, + "language_loss": 0.88684666, + "learning_rate": 0.0007335490547545578, + "loss": 0.89783299, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1897, + "time_per_iteration": 3.086498260498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_mlp": 1.08286643, + "diversity_loss_mlp": 0.0, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06953546528053214, + "language_loss": 0.82679451, + "learning_rate": 0.0007332735420143308, + "loss": 0.83774823, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1898, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097867, + "balance_loss_mlp": 1.08476591, + "diversity_loss_mlp": 0.0, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.07600656362423025, + "language_loss": 0.86647844, + "learning_rate": 0.0007329979387092826, + "loss": 0.87745708, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1899, + "time_per_iteration": 2.5437934398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_mlp": 1.08821869, + "diversity_loss_mlp": 0.0, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.05952938167480439, + "language_loss": 0.83796108, + "learning_rate": 0.0007327222449464124, + "loss": 0.8489722, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1900, + "time_per_iteration": 3.2824244499206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_mlp": 1.09499097, + "diversity_loss_mlp": 0.0, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07745224305421915, + "language_loss": 0.88634431, + "learning_rate": 0.0007324464608327538, + "loss": 0.89742231, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.12823486, + "routerloss_mlp": 0.0, + "step": 1901, + "time_per_iteration": 2.6411991119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_mlp": 1.08995461, + "diversity_loss_mlp": 0.0, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.08223816362142805, + "language_loss": 0.88474846, + "learning_rate": 0.0007321705864753758, + "loss": 0.89577842, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.1305542, + "routerloss_mlp": 0.0, + "step": 1902, + "time_per_iteration": 2.682002544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931657, + "balance_loss_mlp": 1.62497878, + "diversity_loss_mlp": 0.20707282, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.026825446902959647, + "language_loss": 0.84137708, + "learning_rate": 0.0007318946219813823, + "loss": 0.85069364, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563089, + "step": 1903, + "time_per_iteration": 3.0061404705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108165, + "balance_loss_mlp": 1.09403849, + "diversity_loss_mlp": 0.0, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.07526416733947026, + "language_loss": 0.89736164, + "learning_rate": 0.000731618567457912, + "loss": 0.90844321, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.14105225, + "routerloss_mlp": 0.0, + "step": 1904, + "time_per_iteration": 2.6523027420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099252, + "balance_loss_mlp": 1.08536446, + "diversity_loss_mlp": 0.0, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07605082206895837, + "language_loss": 0.87058568, + "learning_rate": 0.000731342423012139, + "loss": 0.88157821, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1905, + "time_per_iteration": 3.0595312118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_mlp": 1.08213234, + "diversity_loss_mlp": 0.0, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07718853495225737, + "language_loss": 0.82559443, + "learning_rate": 0.0007310661887512722, + "loss": 0.83655763, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1906, + "time_per_iteration": 3.056859016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090478, + "balance_loss_mlp": 1.07672131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.07458396044121823, + "language_loss": 0.8194133, + "learning_rate": 0.0007307898647825549, + "loss": 0.83031803, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1907, + "time_per_iteration": 2.670468807220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_mlp": 1.07666349, + "diversity_loss_mlp": 0.0, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09231339543244264, + "language_loss": 0.89368939, + "learning_rate": 0.0007305134512132659, + "loss": 0.90459347, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.13751221, + "routerloss_mlp": 0.0, + "step": 1908, + "time_per_iteration": 2.6561663150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091843, + "balance_loss_mlp": 1.07826495, + "diversity_loss_mlp": 0.0, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.08913139219920335, + "language_loss": 0.83308864, + "learning_rate": 0.0007302369481507183, + "loss": 0.84400707, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1909, + "time_per_iteration": 2.5485799312591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017138, + "balance_loss_mlp": 1.00979447, + "diversity_loss_mlp": 0.0, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.013277678950868657, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.80978894, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1910, + "time_per_iteration": 4.848855257034302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111697, + "balance_loss_mlp": 1.09842944, + "diversity_loss_mlp": 0.0, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.058739485749840115, + "language_loss": 0.85315347, + "learning_rate": 0.000729683673975274, + "loss": 0.86427045, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.13287354, + "routerloss_mlp": 0.0, + "step": 1911, + "time_per_iteration": 2.690218210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114863, + "balance_loss_mlp": 1.10165429, + "diversity_loss_mlp": 0.0, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05585809452393386, + "language_loss": 0.8291769, + "learning_rate": 0.0007294069030771774, + "loss": 0.84032547, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1912, + "time_per_iteration": 3.678927183151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125561, + "balance_loss_mlp": 1.1124301, + "diversity_loss_mlp": 0.0, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.06389765233013874, + "language_loss": 0.90667701, + "learning_rate": 0.0007291300431154224, + "loss": 0.91793263, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1913, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.03611672, + "diversity_loss_mlp": 0.0, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.02051984405011318, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7143358, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1914, + "time_per_iteration": 4.973980903625488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137116, + "balance_loss_mlp": 1.12441444, + "diversity_loss_mlp": 0.0, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.0814243559806059, + "language_loss": 0.7981922, + "learning_rate": 0.0007285760564309179, + "loss": 0.8095634, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.1270752, + "routerloss_mlp": 0.0, + "step": 1915, + "time_per_iteration": 3.091447353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127931, + "balance_loss_mlp": 1.11485386, + "diversity_loss_mlp": 0.0, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.09574055809111115, + "language_loss": 0.84848046, + "learning_rate": 0.0007282989299232448, + "loss": 0.85975981, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.13092041, + "routerloss_mlp": 0.0, + "step": 1916, + "time_per_iteration": 3.074547052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113017, + "balance_loss_mlp": 1.09977341, + "diversity_loss_mlp": 0.0, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.08763204320127825, + "language_loss": 0.83209801, + "learning_rate": 0.0007280217147820668, + "loss": 0.84322822, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1917, + "time_per_iteration": 2.6260228157043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092055, + "balance_loss_mlp": 1.07888198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06316346716689762, + "language_loss": 0.79465461, + "learning_rate": 0.0007277444111150079, + "loss": 0.80557513, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.13189697, + "routerloss_mlp": 0.0, + "step": 1918, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_mlp": 1.07465601, + "diversity_loss_mlp": 0.0, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.09595367080188737, + "language_loss": 0.84512901, + "learning_rate": 0.0007274670190297272, + "loss": 0.85601443, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1919, + "time_per_iteration": 2.590839147567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_mlp": 1.07205224, + "diversity_loss_mlp": 0.0, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.07431087712553297, + "language_loss": 0.82079387, + "learning_rate": 0.0007271895386339179, + "loss": 0.83165228, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1920, + "time_per_iteration": 2.7924282550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094639, + "balance_loss_mlp": 1.08048892, + "diversity_loss_mlp": 0.0, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.07797312778631413, + "language_loss": 0.83431751, + "learning_rate": 0.0007269119700353073, + "loss": 0.84526384, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1921, + "time_per_iteration": 2.7155139446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_mlp": 1.0987196, + "diversity_loss_mlp": 0.0, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.07250682713227712, + "language_loss": 0.84994757, + "learning_rate": 0.0007266343133416571, + "loss": 0.86107111, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1922, + "time_per_iteration": 2.7394983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.06564641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.035523530201468645, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78190196, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.07617188, + "routerloss_mlp": 0.0, + "step": 1923, + "time_per_iteration": 4.877161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10153794, + "diversity_loss_mlp": 0.0, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.0789330271899564, + "language_loss": 0.84356588, + "learning_rate": 0.0007260787361004556, + "loss": 0.85471952, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1924, + "time_per_iteration": 2.608745813369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_mlp": 1.02985299, + "diversity_loss_mlp": 0.0, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.021371165562314075, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74798417, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 1925, + "time_per_iteration": 4.906585931777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114233, + "balance_loss_mlp": 1.10069048, + "diversity_loss_mlp": 0.0, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.12026638393290963, + "language_loss": 0.87422252, + "learning_rate": 0.0007255228077730903, + "loss": 0.88536477, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1926, + "time_per_iteration": 2.6886680126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_mlp": 1.11107421, + "diversity_loss_mlp": 0.0, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06719853297068734, + "language_loss": 0.81722987, + "learning_rate": 0.0007252447122218632, + "loss": 0.82846814, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1927, + "time_per_iteration": 3.1511058807373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_mlp": 1.11258984, + "diversity_loss_mlp": 0.0, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.08764579691953547, + "language_loss": 0.87849444, + "learning_rate": 0.0007249665292228834, + "loss": 0.88974959, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1928, + "time_per_iteration": 2.565991163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120289, + "balance_loss_mlp": 1.1073308, + "diversity_loss_mlp": 0.0, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.0633685198143462, + "language_loss": 0.83318496, + "learning_rate": 0.000724688258884151, + "loss": 0.84438789, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1929, + "time_per_iteration": 2.531827926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.10286927, + "diversity_loss_mlp": 0.0, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.05744658583323744, + "language_loss": 0.86564112, + "learning_rate": 0.0007244099013137002, + "loss": 0.8767941, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1930, + "time_per_iteration": 3.1130166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.10404849, + "diversity_loss_mlp": 0.0, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.06880018611034966, + "language_loss": 0.88695574, + "learning_rate": 0.0007241314566195993, + "loss": 0.89812243, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.1262207, + "routerloss_mlp": 0.0, + "step": 1931, + "time_per_iteration": 3.374743700027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110097, + "balance_loss_mlp": 1.08821416, + "diversity_loss_mlp": 0.0, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.06303779661636588, + "language_loss": 0.85510373, + "learning_rate": 0.0007238529249099496, + "loss": 0.86611342, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1932, + "time_per_iteration": 2.6654059886932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_mlp": 1.0911988, + "diversity_loss_mlp": 0.0, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03412398452916775, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78954613, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 1933, + "time_per_iteration": 4.851354598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091175, + "balance_loss_mlp": 1.07859278, + "diversity_loss_mlp": 0.0, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.08014253307267598, + "language_loss": 0.80636895, + "learning_rate": 0.000723295600876581, + "loss": 0.81728071, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 1934, + "time_per_iteration": 3.0025534629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097044, + "balance_loss_mlp": 1.08416963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.08698689907724866, + "language_loss": 0.88006312, + "learning_rate": 0.0007230168087692344, + "loss": 0.89103359, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.12872314, + "routerloss_mlp": 0.0, + "step": 1935, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095813, + "balance_loss_mlp": 1.0830214, + "diversity_loss_mlp": 0.0, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07031074193849007, + "language_loss": 0.82382512, + "learning_rate": 0.0007227379300790839, + "loss": 0.8347832, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1936, + "time_per_iteration": 3.0040676593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092601, + "balance_loss_mlp": 1.07969058, + "diversity_loss_mlp": 0.0, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.07132774808829288, + "language_loss": 0.85478282, + "learning_rate": 0.0007224589649143997, + "loss": 0.86570889, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 1937, + "time_per_iteration": 2.584545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089825, + "balance_loss_mlp": 1.07662272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.0711139803163438, + "language_loss": 0.8120302, + "learning_rate": 0.0007221799133834861, + "loss": 0.82292843, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.13214111, + "routerloss_mlp": 0.0, + "step": 1938, + "time_per_iteration": 2.6393649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109955, + "balance_loss_mlp": 1.08649623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20460237815205612, + "language_loss": 0.81793052, + "learning_rate": 0.00072190077559468, + "loss": 0.82892597, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1939, + "time_per_iteration": 2.5494682788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127139, + "balance_loss_mlp": 1.1140976, + "diversity_loss_mlp": 0.0, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.05817015695703163, + "language_loss": 0.89248812, + "learning_rate": 0.0007216215516563527, + "loss": 0.90375948, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.13049316, + "routerloss_mlp": 0.0, + "step": 1940, + "time_per_iteration": 2.6755452156066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129035, + "balance_loss_mlp": 1.1159811, + "diversity_loss_mlp": 0.0, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.07778932214282369, + "language_loss": 0.83852386, + "learning_rate": 0.0007213422416769083, + "loss": 0.84981418, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1941, + "time_per_iteration": 2.6008002758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_mlp": 1.12319708, + "diversity_loss_mlp": 0.0, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.06345716224902766, + "language_loss": 0.7501297, + "learning_rate": 0.0007210628457647849, + "loss": 0.76148963, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1942, + "time_per_iteration": 2.5911362171173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_mlp": 1.12763917, + "diversity_loss_mlp": 0.0, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.06753886702103719, + "language_loss": 0.78585184, + "learning_rate": 0.000720783364028453, + "loss": 0.7972604, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.13238525, + "routerloss_mlp": 0.0, + "step": 1943, + "time_per_iteration": 2.7490458488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149977, + "balance_loss_mlp": 1.13685822, + "diversity_loss_mlp": 0.0, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.0650742437261564, + "language_loss": 0.87667847, + "learning_rate": 0.0007205037965764177, + "loss": 0.88817823, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1944, + "time_per_iteration": 2.5870554447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_mlp": 1.12192512, + "diversity_loss_mlp": 0.0, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07468357539719116, + "language_loss": 0.85650361, + "learning_rate": 0.0007202241435172161, + "loss": 0.86785173, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1945, + "time_per_iteration": 2.7550253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131811, + "balance_loss_mlp": 1.11901414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07270487210957549, + "language_loss": 0.87884831, + "learning_rate": 0.0007199444049594198, + "loss": 0.8901664, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1946, + "time_per_iteration": 2.9499337673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111095, + "balance_loss_mlp": 1.09783912, + "diversity_loss_mlp": 0.0, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.07247382516020226, + "language_loss": 0.83384776, + "learning_rate": 0.0007196645810116322, + "loss": 0.84495866, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1947, + "time_per_iteration": 2.70394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113218, + "balance_loss_mlp": 1.1003499, + "diversity_loss_mlp": 0.0, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07522309633784076, + "language_loss": 0.84431696, + "learning_rate": 0.0007193846717824912, + "loss": 0.8554492, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1948, + "time_per_iteration": 2.923752546310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116002, + "balance_loss_mlp": 1.10312748, + "diversity_loss_mlp": 0.0, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.06883561802065806, + "language_loss": 0.88268626, + "learning_rate": 0.0007191046773806669, + "loss": 0.89384627, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1949, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108921, + "balance_loss_mlp": 1.09593272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07969110082801287, + "language_loss": 0.83211446, + "learning_rate": 0.0007188245979148631, + "loss": 0.84320366, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1950, + "time_per_iteration": 3.193124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_mlp": 1.09892154, + "diversity_loss_mlp": 0.0, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.07005872092850987, + "language_loss": 0.87434363, + "learning_rate": 0.0007185444334938157, + "loss": 0.88546085, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1951, + "time_per_iteration": 2.669201135635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_mlp": 1.0892663, + "diversity_loss_mlp": 0.0, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.08195801919923047, + "language_loss": 0.85047525, + "learning_rate": 0.0007182641842262947, + "loss": 0.86149311, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 1952, + "time_per_iteration": 2.602139472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092197, + "balance_loss_mlp": 1.07936394, + "diversity_loss_mlp": 0.0, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.07349771430020792, + "language_loss": 0.77754879, + "learning_rate": 0.0007179838502211022, + "loss": 0.78847075, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.128479, + "routerloss_mlp": 0.0, + "step": 1953, + "time_per_iteration": 2.85720157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_mlp": 1.08148086, + "diversity_loss_mlp": 0.0, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0681681729591206, + "language_loss": 0.86330736, + "learning_rate": 0.0007177034315870738, + "loss": 0.87425238, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1954, + "time_per_iteration": 2.958862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_mlp": 1.08803654, + "diversity_loss_mlp": 0.0, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06642365438263753, + "language_loss": 0.90809441, + "learning_rate": 0.0007174229284330773, + "loss": 0.91910505, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1955, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_mlp": 1.07642531, + "diversity_loss_mlp": 0.0, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.07788827503332588, + "language_loss": 0.86705017, + "learning_rate": 0.0007171423408680141, + "loss": 0.87794375, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1956, + "time_per_iteration": 2.8101606369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00950311, + "balance_loss_mlp": 1.6602329, + "diversity_loss_mlp": 0.20739825, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.03218717292019043, + "language_loss": 0.89567441, + "learning_rate": 0.0007168616690008176, + "loss": 0.90517747, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01649548, + "step": 1957, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081569, + "balance_loss_mlp": 1.06840825, + "diversity_loss_mlp": 0.0, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.07242251254882147, + "language_loss": 0.85681045, + "learning_rate": 0.0007165809129404545, + "loss": 0.86762613, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1958, + "time_per_iteration": 2.8396048545837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_mlp": 1.07657433, + "diversity_loss_mlp": 0.0, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08227545286248691, + "language_loss": 0.86212921, + "learning_rate": 0.0007163000727959239, + "loss": 0.87303019, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1959, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087148, + "balance_loss_mlp": 1.07989979, + "diversity_loss_mlp": 0.0, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.05215322395932221, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79046214, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 1960, + "time_per_iteration": 4.869986057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095327, + "balance_loss_mlp": 1.08232689, + "diversity_loss_mlp": 0.0, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.08048811275026858, + "language_loss": 0.84568793, + "learning_rate": 0.00071573814069052, + "loss": 0.85664117, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.13018799, + "routerloss_mlp": 0.0, + "step": 1961, + "time_per_iteration": 2.9122819900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109901, + "balance_loss_mlp": 1.08614171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.06061063893945359, + "language_loss": 0.88073885, + "learning_rate": 0.0007154570489478081, + "loss": 0.89172894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1962, + "time_per_iteration": 3.1824018955230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111399, + "balance_loss_mlp": 1.10154414, + "diversity_loss_mlp": 0.0, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.06274200702745775, + "language_loss": 0.86391222, + "learning_rate": 0.0007151758735572514, + "loss": 0.87505209, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 1963, + "time_per_iteration": 2.997624158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_mlp": 1.09836888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.07983075782925624, + "language_loss": 0.80894458, + "learning_rate": 0.0007148946146280119, + "loss": 0.82005548, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 1964, + "time_per_iteration": 2.836583137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00620122, + "balance_loss_mlp": 1.05382681, + "diversity_loss_mlp": 0.16216688, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.0017779517528101797, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.72812271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01212509, + "step": 1965, + "time_per_iteration": 4.906678915023804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_mlp": 1.02436352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.025755206304302582, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.7637251, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 1966, + "time_per_iteration": 4.93319296836853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127949, + "balance_loss_mlp": 1.11581361, + "diversity_loss_mlp": 0.0, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.05898800907157556, + "language_loss": 0.83873129, + "learning_rate": 0.0007140503377003022, + "loss": 0.85001081, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 1967, + "time_per_iteration": 2.9807000160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123739, + "balance_loss_mlp": 1.11125755, + "diversity_loss_mlp": 0.0, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.06421364750503517, + "language_loss": 0.84625173, + "learning_rate": 0.000713768745708599, + "loss": 0.85748911, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1968, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_mlp": 1.10671234, + "diversity_loss_mlp": 0.0, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.06880095080762995, + "language_loss": 0.77052647, + "learning_rate": 0.0007134870707245085, + "loss": 0.78171611, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.12249756, + "routerloss_mlp": 0.0, + "step": 1969, + "time_per_iteration": 3.302985429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_mlp": 1.10852587, + "diversity_loss_mlp": 0.0, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.07142024228833302, + "language_loss": 0.84469545, + "learning_rate": 0.0007132053128573864, + "loss": 0.85590458, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.12384033, + "routerloss_mlp": 0.0, + "step": 1970, + "time_per_iteration": 2.7751197814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.11231327, + "diversity_loss_mlp": 0.0, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06795721743578591, + "language_loss": 0.83786452, + "learning_rate": 0.0007129234722166211, + "loss": 0.84910882, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 1971, + "time_per_iteration": 2.806898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114684, + "balance_loss_mlp": 1.10238707, + "diversity_loss_mlp": 0.0, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.06601167392952549, + "language_loss": 0.91087604, + "learning_rate": 0.0007126415489116328, + "loss": 0.92202282, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.1229248, + "routerloss_mlp": 0.0, + "step": 1972, + "time_per_iteration": 2.656651496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.09782279, + "diversity_loss_mlp": 0.0, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06641244535011205, + "language_loss": 0.81145501, + "learning_rate": 0.0007123595430518736, + "loss": 0.82255375, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 1973, + "time_per_iteration": 2.8665072917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_mlp": 1.09068835, + "diversity_loss_mlp": 0.0, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.07235703206146665, + "language_loss": 0.86411089, + "learning_rate": 0.0007120774547468282, + "loss": 0.87513655, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 1974, + "time_per_iteration": 2.5590381622314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948323, + "balance_loss_mlp": 1.65707994, + "diversity_loss_mlp": 0.20756721, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.03148003592885531, + "language_loss": 0.81558585, + "learning_rate": 0.0007117952841060128, + "loss": 0.82506907, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01599924, + "step": 1975, + "time_per_iteration": 2.6777563095092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_mlp": 1.07167053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.07660828670939425, + "language_loss": 0.83672053, + "learning_rate": 0.0007115130312389756, + "loss": 0.8475588, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 1976, + "time_per_iteration": 2.7103323936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07200503, + "diversity_loss_mlp": 0.0, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.08353002189035653, + "language_loss": 0.79290646, + "learning_rate": 0.0007112306962552973, + "loss": 0.80375111, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.12463379, + "routerloss_mlp": 0.0, + "step": 1977, + "time_per_iteration": 2.576239824295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_mlp": 1.07254314, + "diversity_loss_mlp": 0.0, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.06483406604645132, + "language_loss": 0.85315859, + "learning_rate": 0.0007109482792645896, + "loss": 0.86400628, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 1978, + "time_per_iteration": 2.7146143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.07276165, + "diversity_loss_mlp": 0.0, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.06865418790878511, + "language_loss": 0.83831733, + "learning_rate": 0.0007106657803764969, + "loss": 0.84916663, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 1979, + "time_per_iteration": 2.73152494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086146, + "balance_loss_mlp": 1.07395101, + "diversity_loss_mlp": 0.0, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07620298141647525, + "language_loss": 0.81962979, + "learning_rate": 0.0007103831997006948, + "loss": 0.83049119, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 1980, + "time_per_iteration": 2.7383615970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_mlp": 1.08276772, + "diversity_loss_mlp": 0.0, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.0842263164190672, + "language_loss": 0.85342598, + "learning_rate": 0.0007101005373468908, + "loss": 0.86437213, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 1981, + "time_per_iteration": 2.889251708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097444, + "balance_loss_mlp": 1.08543372, + "diversity_loss_mlp": 0.0, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.06048237516575629, + "language_loss": 0.86649287, + "learning_rate": 0.0007098177934248242, + "loss": 0.87746727, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 1982, + "time_per_iteration": 2.773146867752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920145, + "balance_loss_mlp": 1.60273147, + "diversity_loss_mlp": 0.20649332, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.033525346661278974, + "language_loss": 0.85516387, + "learning_rate": 0.0007095349680442661, + "loss": 0.86436534, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01553278, + "step": 1983, + "time_per_iteration": 2.8675785064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116298, + "balance_loss_mlp": 1.1045742, + "diversity_loss_mlp": 0.0, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.06407324010727367, + "language_loss": 0.78783178, + "learning_rate": 0.0007092520613150188, + "loss": 0.79899484, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 1984, + "time_per_iteration": 2.709177017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918651, + "balance_loss_mlp": 1.59999418, + "diversity_loss_mlp": 0.20665541, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.03070680845617011, + "language_loss": 0.80925471, + "learning_rate": 0.0007089690733469165, + "loss": 0.81844121, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532666, + "step": 1985, + "time_per_iteration": 2.750558376312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_mlp": 1.12384343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.08571071539105668, + "language_loss": 0.82313848, + "learning_rate": 0.000708686004249825, + "loss": 0.83449578, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 1986, + "time_per_iteration": 2.7550368309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.12012124, + "diversity_loss_mlp": 0.0, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07744479108461458, + "language_loss": 0.91340905, + "learning_rate": 0.0007084028541336413, + "loss": 0.92473006, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 1987, + "time_per_iteration": 2.703339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914957, + "balance_loss_mlp": 1.59260678, + "diversity_loss_mlp": 0.20690078, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.03035395776464378, + "language_loss": 0.86267084, + "learning_rate": 0.0007081196231082942, + "loss": 0.87182039, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01520337, + "step": 1988, + "time_per_iteration": 2.8075153827667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.10567343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.07746710731409655, + "language_loss": 0.80053389, + "learning_rate": 0.0007078363112837436, + "loss": 0.81171107, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.12036133, + "routerloss_mlp": 0.0, + "step": 1989, + "time_per_iteration": 2.811197280883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.09261441, + "diversity_loss_mlp": 0.0, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.07961201652041947, + "language_loss": 0.84721339, + "learning_rate": 0.000707552918769981, + "loss": 0.85826218, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 1990, + "time_per_iteration": 2.4908246994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_mlp": 1.08987188, + "diversity_loss_mlp": 0.0, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.06284554422997896, + "language_loss": 0.83619118, + "learning_rate": 0.000707269445677029, + "loss": 0.84721333, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.12341309, + "routerloss_mlp": 0.0, + "step": 1991, + "time_per_iteration": 2.733126401901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101588, + "balance_loss_mlp": 1.08921361, + "diversity_loss_mlp": 0.0, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07203164936975576, + "language_loss": 0.85140717, + "learning_rate": 0.0007069858921149416, + "loss": 0.86242306, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 1992, + "time_per_iteration": 2.9382007122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_mlp": 1.08434701, + "diversity_loss_mlp": 0.0, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.05485930037569587, + "language_loss": 0.85794246, + "learning_rate": 0.0007067022581938043, + "loss": 0.86891043, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.12457275, + "routerloss_mlp": 0.0, + "step": 1993, + "time_per_iteration": 2.857525110244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_mlp": 1.08321714, + "diversity_loss_mlp": 0.0, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.0871408980162776, + "language_loss": 0.83722532, + "learning_rate": 0.0007064185440237334, + "loss": 0.8481794, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1994, + "time_per_iteration": 2.7131123542785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099231, + "balance_loss_mlp": 1.08733368, + "diversity_loss_mlp": 0.0, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.06357294591464056, + "language_loss": 0.84358412, + "learning_rate": 0.0007061347497148764, + "loss": 0.85457647, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.11895752, + "routerloss_mlp": 0.0, + "step": 1995, + "time_per_iteration": 2.7398569583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_mlp": 1.09015, + "diversity_loss_mlp": 0.0, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.07322887134464046, + "language_loss": 0.86299884, + "learning_rate": 0.0007058508753774122, + "loss": 0.87402225, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1996, + "time_per_iteration": 2.6903162002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108867, + "balance_loss_mlp": 1.09709477, + "diversity_loss_mlp": 0.0, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.0698381422429368, + "language_loss": 0.86921895, + "learning_rate": 0.0007055669211215505, + "loss": 0.88030767, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 1997, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113141, + "balance_loss_mlp": 1.10084486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.08585182349688475, + "language_loss": 0.77776283, + "learning_rate": 0.0007052828870575322, + "loss": 0.78889418, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 1998, + "time_per_iteration": 2.685685873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_mlp": 1.11406291, + "diversity_loss_mlp": 0.0, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06979871165732322, + "language_loss": 0.87060714, + "learning_rate": 0.0007049987732956291, + "loss": 0.8818661, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.11834717, + "routerloss_mlp": 0.0, + "step": 1999, + "time_per_iteration": 2.9710631370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110647, + "balance_loss_mlp": 1.09428668, + "diversity_loss_mlp": 0.0, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05561177596637214, + "language_loss": 0.82812738, + "learning_rate": 0.0007047145799461439, + "loss": 0.83919203, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2000, + "time_per_iteration": 2.8492860794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_mlp": 1.09293747, + "diversity_loss_mlp": 0.0, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06017266002852966, + "language_loss": 0.82272708, + "learning_rate": 0.00070443030711941, + "loss": 0.83377922, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2001, + "time_per_iteration": 2.769383430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_mlp": 1.08806002, + "diversity_loss_mlp": 0.0, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.061888534691205976, + "language_loss": 0.82098496, + "learning_rate": 0.0007041459549257924, + "loss": 0.83198571, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2002, + "time_per_iteration": 2.876244306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089803, + "balance_loss_mlp": 1.07744145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.06816771124006925, + "language_loss": 0.78024125, + "learning_rate": 0.0007038615234756859, + "loss": 0.79113925, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.12359619, + "routerloss_mlp": 0.0, + "step": 2003, + "time_per_iteration": 3.1744768619537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_mlp": 1.07477546, + "diversity_loss_mlp": 0.0, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.09233530116269285, + "language_loss": 0.83808231, + "learning_rate": 0.000703577012879517, + "loss": 0.84895122, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2004, + "time_per_iteration": 2.633391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089004, + "balance_loss_mlp": 1.07705307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07105955558417659, + "language_loss": 0.88946962, + "learning_rate": 0.0007032924232477423, + "loss": 0.90035963, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2005, + "time_per_iteration": 2.6482574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109168, + "balance_loss_mlp": 1.0797528, + "diversity_loss_mlp": 0.0, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.07024694433071269, + "language_loss": 0.80605727, + "learning_rate": 0.0007030077546908493, + "loss": 0.81697416, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2006, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.08056581, + "diversity_loss_mlp": 0.0, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.032453276732354666, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84151709, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.06494141, + "routerloss_mlp": 0.0, + "step": 2007, + "time_per_iteration": 4.798014402389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099136, + "balance_loss_mlp": 1.08744717, + "diversity_loss_mlp": 0.0, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.08661380313869275, + "language_loss": 0.79137146, + "learning_rate": 0.0007024381812438117, + "loss": 0.8023628, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2008, + "time_per_iteration": 2.5403189659118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_mlp": 1.08864713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09407170185597404, + "language_loss": 0.83448064, + "learning_rate": 0.0007021532765747951, + "loss": 0.8454901, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2009, + "time_per_iteration": 2.9585187435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094144, + "balance_loss_mlp": 1.08211613, + "diversity_loss_mlp": 0.0, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.0684890586406507, + "language_loss": 0.79048979, + "learning_rate": 0.0007018682934229162, + "loss": 0.80143124, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2010, + "time_per_iteration": 2.9703307151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_mlp": 1.0842756, + "diversity_loss_mlp": 0.0, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06303649013837292, + "language_loss": 0.82761061, + "learning_rate": 0.0007015832318988152, + "loss": 0.83857542, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2011, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_mlp": 1.02231336, + "diversity_loss_mlp": 0.0, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.017766506591404385, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.7491802, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 2012, + "time_per_iteration": 4.938155651092529 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_mlp": 1.07810068, + "diversity_loss_mlp": 0.0, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.060967443696148906, + "language_loss": 0.84265292, + "learning_rate": 0.0007010128741766604, + "loss": 0.85356176, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 2013, + "time_per_iteration": 2.7293431758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091499, + "balance_loss_mlp": 1.07861209, + "diversity_loss_mlp": 0.0, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.07873148114105366, + "language_loss": 0.84277219, + "learning_rate": 0.0007007275782000391, + "loss": 0.85368717, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 2014, + "time_per_iteration": 2.644911766052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091625, + "balance_loss_mlp": 1.07889354, + "diversity_loss_mlp": 0.0, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.0868083489465314, + "language_loss": 0.8502394, + "learning_rate": 0.0007004422042940605, + "loss": 0.86115563, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2015, + "time_per_iteration": 2.5096747875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109894, + "balance_loss_mlp": 1.08593392, + "diversity_loss_mlp": 0.0, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08227522563153689, + "language_loss": 0.89877218, + "learning_rate": 0.0007001567525695169, + "loss": 0.90976155, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 2016, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06437704205290017, + "language_loss": 0.83705699, + "learning_rate": 0.0006998712231372303, + "loss": 0.84811676, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2017, + "time_per_iteration": 3.016061305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119415, + "balance_loss_mlp": 1.10692167, + "diversity_loss_mlp": 0.0, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06622760195410109, + "language_loss": 0.85886908, + "learning_rate": 0.0006995856161080532, + "loss": 0.87006325, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.12487793, + "routerloss_mlp": 0.0, + "step": 2018, + "time_per_iteration": 2.8263893127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_mlp": 1.11165869, + "diversity_loss_mlp": 0.0, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.06957079313074316, + "language_loss": 0.82328916, + "learning_rate": 0.0006992999315928679, + "loss": 0.83453172, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.1260376, + "routerloss_mlp": 0.0, + "step": 2019, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.11772799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.05589846380959986, + "language_loss": 0.85480869, + "learning_rate": 0.0006990141697025871, + "loss": 0.86611497, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 2020, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067569, + "balance_loss_mlp": 1.06141829, + "diversity_loss_mlp": 0.0, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.034323999481440985, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77427208, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2021, + "time_per_iteration": 4.782108545303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130924, + "balance_loss_mlp": 1.11879468, + "diversity_loss_mlp": 0.0, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0813348018947899, + "language_loss": 0.82333553, + "learning_rate": 0.0006984424142405392, + "loss": 0.83464473, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2022, + "time_per_iteration": 2.804866075515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.10578668, + "diversity_loss_mlp": 0.0, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.07379903296161248, + "language_loss": 0.82117045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83235097, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2023, + "time_per_iteration": 2.5883662700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_mlp": 1.11855519, + "diversity_loss_mlp": 0.0, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.07869766022149485, + "language_loss": 0.8995713, + "learning_rate": 0.0006978703506098102, + "loss": 0.91087961, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2024, + "time_per_iteration": 2.730283498764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127449, + "balance_loss_mlp": 1.11556411, + "diversity_loss_mlp": 0.0, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.0665173530375796, + "language_loss": 0.88210815, + "learning_rate": 0.00069758420350879, + "loss": 0.89338267, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2025, + "time_per_iteration": 2.62969708442688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932402, + "balance_loss_mlp": 1.62686133, + "diversity_loss_mlp": 0.20693868, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.03379762859523427, + "language_loss": 0.8613863, + "learning_rate": 0.000697297979698779, + "loss": 0.87071025, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01550185, + "step": 2026, + "time_per_iteration": 2.837543249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_mlp": 1.09529877, + "diversity_loss_mlp": 0.0, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06049708379655892, + "language_loss": 0.83660531, + "learning_rate": 0.0006970116792908992, + "loss": 0.84767604, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2027, + "time_per_iteration": 3.1133604049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_mlp": 1.0960542, + "diversity_loss_mlp": 0.0, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.07190738956644391, + "language_loss": 0.81380564, + "learning_rate": 0.000696725302396302, + "loss": 0.82488191, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2028, + "time_per_iteration": 2.6460230350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109766, + "balance_loss_mlp": 1.08604932, + "diversity_loss_mlp": 0.0, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.06814290150602269, + "language_loss": 0.85887402, + "learning_rate": 0.0006964388491261692, + "loss": 0.86985064, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2029, + "time_per_iteration": 3.296208143234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099195, + "balance_loss_mlp": 1.0871129, + "diversity_loss_mlp": 0.0, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.075812953715104, + "language_loss": 0.87511015, + "learning_rate": 0.0006961523195917114, + "loss": 0.88610214, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2030, + "time_per_iteration": 2.803239345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_mlp": 1.09573865, + "diversity_loss_mlp": 0.0, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.0665807006884719, + "language_loss": 0.78137511, + "learning_rate": 0.0006958657139041696, + "loss": 0.79245031, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2031, + "time_per_iteration": 2.739151954650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_mlp": 1.05531955, + "diversity_loss_mlp": 0.0, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.035996309550900246, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77773988, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.05688477, + "routerloss_mlp": 0.0, + "step": 2032, + "time_per_iteration": 4.918209552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_mlp": 1.08307993, + "diversity_loss_mlp": 0.0, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.0751880944680772, + "language_loss": 0.78643966, + "learning_rate": 0.0006952922745149434, + "loss": 0.79738843, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2033, + "time_per_iteration": 2.6274161338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091358, + "balance_loss_mlp": 1.07940745, + "diversity_loss_mlp": 0.0, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.07391479182011068, + "language_loss": 0.87674987, + "learning_rate": 0.000695005441035888, + "loss": 0.88766348, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2034, + "time_per_iteration": 2.647348642349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018577, + "balance_loss_mlp": 1.01280713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.010435626825017296, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74742007, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2035, + "time_per_iteration": 4.8861188888549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107172, + "balance_loss_mlp": 1.094733, + "diversity_loss_mlp": 0.0, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06114898183694146, + "language_loss": 0.81133932, + "learning_rate": 0.0006944315470656863, + "loss": 0.82241106, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.12438965, + "routerloss_mlp": 0.0, + "step": 2036, + "time_per_iteration": 3.0057246685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_mlp": 1.09606266, + "diversity_loss_mlp": 0.0, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.0812142536963638, + "language_loss": 0.90953541, + "learning_rate": 0.000694144486797345, + "loss": 0.92062169, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2037, + "time_per_iteration": 2.6566872596740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_mlp": 1.0060699, + "diversity_loss_mlp": 0.0, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.012879447335335118, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80532491, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2038, + "time_per_iteration": 4.609802722930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.09141517, + "diversity_loss_mlp": 0.0, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.07718413790316761, + "language_loss": 0.89271998, + "learning_rate": 0.0006935701402514156, + "loss": 0.90375727, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2039, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101805, + "balance_loss_mlp": 1.01206541, + "diversity_loss_mlp": 0.0, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.016017309503016164, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74052942, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2040, + "time_per_iteration": 4.954579830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_mlp": 1.09434199, + "diversity_loss_mlp": 0.0, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.0728619475730698, + "language_loss": 0.84539711, + "learning_rate": 0.0006929954931031422, + "loss": 0.85646391, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2041, + "time_per_iteration": 3.6979990005493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114297, + "balance_loss_mlp": 1.10201287, + "diversity_loss_mlp": 0.0, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.07303574322286652, + "language_loss": 0.88330269, + "learning_rate": 0.0006927080570819805, + "loss": 0.89444566, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2042, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_mlp": 1.11437607, + "diversity_loss_mlp": 0.0, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09784101638347129, + "language_loss": 0.80726093, + "learning_rate": 0.0006924205462449161, + "loss": 0.81852722, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2043, + "time_per_iteration": 2.556964159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123407, + "balance_loss_mlp": 1.11139631, + "diversity_loss_mlp": 0.0, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07674510212981295, + "language_loss": 0.81822228, + "learning_rate": 0.0006921329607035702, + "loss": 0.82945639, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.12005615, + "routerloss_mlp": 0.0, + "step": 2044, + "time_per_iteration": 3.2355051040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_mlp": 1.09777582, + "diversity_loss_mlp": 0.0, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0626655505852987, + "language_loss": 0.87889385, + "learning_rate": 0.0006918453005695938, + "loss": 0.88998848, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2045, + "time_per_iteration": 2.616405725479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112555, + "balance_loss_mlp": 1.10047281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0593607382511463, + "language_loss": 0.8430419, + "learning_rate": 0.0006915575659546662, + "loss": 0.85416746, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2046, + "time_per_iteration": 2.6596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_mlp": 1.08785915, + "diversity_loss_mlp": 0.0, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0680979304239865, + "language_loss": 0.80745959, + "learning_rate": 0.0006912697569704959, + "loss": 0.81846058, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2047, + "time_per_iteration": 2.5962154865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097855, + "balance_loss_mlp": 1.08564174, + "diversity_loss_mlp": 0.0, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07634449995136075, + "language_loss": 0.8702817, + "learning_rate": 0.0006909818737288205, + "loss": 0.88126016, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.12207031, + "routerloss_mlp": 0.0, + "step": 2048, + "time_per_iteration": 2.5559332370758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111282, + "balance_loss_mlp": 1.09955215, + "diversity_loss_mlp": 0.0, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07451514550279957, + "language_loss": 0.80715293, + "learning_rate": 0.000690693916341406, + "loss": 0.81826574, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2049, + "time_per_iteration": 2.605881690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_mlp": 1.10377121, + "diversity_loss_mlp": 0.0, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.06516266173427393, + "language_loss": 0.82286257, + "learning_rate": 0.0006904058849200475, + "loss": 0.83401763, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2050, + "time_per_iteration": 2.7183115482330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.09360313, + "diversity_loss_mlp": 0.0, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.0753850450331705, + "language_loss": 0.84972727, + "learning_rate": 0.0006901177795765683, + "loss": 0.8607837, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2051, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105318, + "balance_loss_mlp": 1.09354019, + "diversity_loss_mlp": 0.0, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.06465732667856934, + "language_loss": 0.81096435, + "learning_rate": 0.0006898296004228213, + "loss": 0.82201755, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2052, + "time_per_iteration": 2.7607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050397, + "balance_loss_mlp": 1.04446077, + "diversity_loss_mlp": 0.0, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03031396698302257, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177135, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.05932617, + "routerloss_mlp": 0.0, + "step": 2053, + "time_per_iteration": 4.876460552215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_mlp": 1.10529494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.1105412420488248, + "language_loss": 0.79620701, + "learning_rate": 0.0006892530211320763, + "loss": 0.80737776, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2054, + "time_per_iteration": 2.702591896057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00944261, + "balance_loss_mlp": 1.6481061, + "diversity_loss_mlp": 0.21043469, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.03587460904718008, + "language_loss": 0.84313488, + "learning_rate": 0.000688964621218926, + "loss": 0.85257751, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01499031, + "step": 2055, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109262, + "balance_loss_mlp": 1.08063984, + "diversity_loss_mlp": 0.0, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.0862390851468888, + "language_loss": 0.80478442, + "learning_rate": 0.0006886761479432037, + "loss": 0.81571066, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2056, + "time_per_iteration": 2.8577234745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_mlp": 1.06739902, + "diversity_loss_mlp": 0.0, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.06874544900142358, + "language_loss": 0.84387571, + "learning_rate": 0.0006883876014169045, + "loss": 0.85467529, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2057, + "time_per_iteration": 2.572458505630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073975, + "balance_loss_mlp": 1.06154716, + "diversity_loss_mlp": 0.0, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07681071569739906, + "language_loss": 0.90056652, + "learning_rate": 0.000688098981752052, + "loss": 0.91130626, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 2058, + "time_per_iteration": 2.7125563621520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080003, + "balance_loss_mlp": 1.06697917, + "diversity_loss_mlp": 0.0, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08571295812058347, + "language_loss": 0.80176479, + "learning_rate": 0.0006878102890606982, + "loss": 0.81256485, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 2059, + "time_per_iteration": 3.0797197818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108467, + "balance_loss_mlp": 1.07161617, + "diversity_loss_mlp": 0.0, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08415103615204221, + "language_loss": 0.81576395, + "learning_rate": 0.0006875215234549239, + "loss": 0.82661068, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 2060, + "time_per_iteration": 2.5358171463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_mlp": 1.06604218, + "diversity_loss_mlp": 0.0, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08360675720274492, + "language_loss": 0.85212821, + "learning_rate": 0.0006872326850468376, + "loss": 0.86291778, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 2061, + "time_per_iteration": 2.685746669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_mlp": 1.06612396, + "diversity_loss_mlp": 0.0, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08669948408116639, + "language_loss": 0.78834969, + "learning_rate": 0.0006869437739485762, + "loss": 0.79913992, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.12908936, + "routerloss_mlp": 0.0, + "step": 2062, + "time_per_iteration": 2.608938455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085084, + "balance_loss_mlp": 1.07266808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06314890183319057, + "language_loss": 0.92750764, + "learning_rate": 0.0006866547902723053, + "loss": 0.93835843, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.12420654, + "routerloss_mlp": 0.0, + "step": 2063, + "time_per_iteration": 2.654764175415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07135844, + "diversity_loss_mlp": 0.0, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10797740353372913, + "language_loss": 0.80444092, + "learning_rate": 0.000686365734130218, + "loss": 0.81527805, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2064, + "time_per_iteration": 2.7161076068878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_mlp": 1.07345843, + "diversity_loss_mlp": 0.0, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06605501724079509, + "language_loss": 0.83883071, + "learning_rate": 0.000686076605634536, + "loss": 0.84968603, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2065, + "time_per_iteration": 2.5960052013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088286, + "balance_loss_mlp": 1.07656133, + "diversity_loss_mlp": 0.0, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.06893141882644385, + "language_loss": 0.84303313, + "learning_rate": 0.0006857874048975088, + "loss": 0.85391599, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2066, + "time_per_iteration": 2.5419557094573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.08599246, + "diversity_loss_mlp": 0.0, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.07076940729430262, + "language_loss": 0.86944497, + "learning_rate": 0.0006854981320314142, + "loss": 0.88042831, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2067, + "time_per_iteration": 2.4425127506256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101956, + "balance_loss_mlp": 1.0900414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08678893766230582, + "language_loss": 0.86775517, + "learning_rate": 0.0006852087871485579, + "loss": 0.87877476, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2068, + "time_per_iteration": 2.617234468460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_mlp": 1.09308147, + "diversity_loss_mlp": 0.0, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08540761893483814, + "language_loss": 0.81805646, + "learning_rate": 0.0006849193703612735, + "loss": 0.82910275, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2069, + "time_per_iteration": 2.7818312644958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.08808875, + "diversity_loss_mlp": 0.0, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.06305964525737012, + "language_loss": 0.77731991, + "learning_rate": 0.0006846298817819225, + "loss": 0.78832221, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2070, + "time_per_iteration": 2.970045328140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_mlp": 1.08777106, + "diversity_loss_mlp": 0.0, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.09229213766989015, + "language_loss": 0.81058359, + "learning_rate": 0.0006843403215228945, + "loss": 0.82158017, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2071, + "time_per_iteration": 2.47542405128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.08525538, + "diversity_loss_mlp": 0.0, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.06250612449775428, + "language_loss": 0.80665851, + "learning_rate": 0.0006840506896966065, + "loss": 0.81763273, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2072, + "time_per_iteration": 2.7048730850219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_mlp": 1.09000397, + "diversity_loss_mlp": 0.0, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07670911788950584, + "language_loss": 0.82343054, + "learning_rate": 0.0006837609864155038, + "loss": 0.83445203, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2073, + "time_per_iteration": 2.940208673477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_mlp": 1.09976768, + "diversity_loss_mlp": 0.0, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.06443735331096001, + "language_loss": 0.83203363, + "learning_rate": 0.0006834712117920592, + "loss": 0.84314907, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2074, + "time_per_iteration": 2.6217153072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_mlp": 1.09892166, + "diversity_loss_mlp": 0.0, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07401760730887977, + "language_loss": 0.85670066, + "learning_rate": 0.0006831813659387729, + "loss": 0.86781245, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2075, + "time_per_iteration": 2.5696237087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109855, + "balance_loss_mlp": 1.09774292, + "diversity_loss_mlp": 0.0, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.05990934262108594, + "language_loss": 0.84167391, + "learning_rate": 0.0006828914489681733, + "loss": 0.85277247, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2076, + "time_per_iteration": 2.7859339714050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119025, + "balance_loss_mlp": 1.1072948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.06517456650976074, + "language_loss": 0.85312855, + "learning_rate": 0.0006826014609928162, + "loss": 0.86431879, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2077, + "time_per_iteration": 2.6851699352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0060157, + "balance_loss_mlp": 1.02597332, + "diversity_loss_mlp": 0.1552759, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.0013651319096223075, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8380096, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094547, + "step": 2078, + "time_per_iteration": 4.859188795089722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.1030947, + "diversity_loss_mlp": 0.0, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.0748648316539235, + "language_loss": 0.80062771, + "learning_rate": 0.0006820212724781896, + "loss": 0.81177354, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.11486816, + "routerloss_mlp": 0.0, + "step": 2079, + "time_per_iteration": 2.6628189086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.09492946, + "diversity_loss_mlp": 0.0, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06148312623903997, + "language_loss": 0.83733618, + "learning_rate": 0.0006817310721641694, + "loss": 0.84840119, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2080, + "time_per_iteration": 2.847182512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119227, + "balance_loss_mlp": 1.10731816, + "diversity_loss_mlp": 0.0, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.07223167054032475, + "language_loss": 0.83566946, + "learning_rate": 0.00068144080129589, + "loss": 0.84686172, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2081, + "time_per_iteration": 2.7161402702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.10388541, + "diversity_loss_mlp": 0.0, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.07619573858560975, + "language_loss": 0.8280167, + "learning_rate": 0.0006811504599860441, + "loss": 0.83917284, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2082, + "time_per_iteration": 2.5584774017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104984, + "balance_loss_mlp": 1.0928719, + "diversity_loss_mlp": 0.0, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.1306421138400452, + "language_loss": 0.8569895, + "learning_rate": 0.0006808600483473526, + "loss": 0.86803931, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2083, + "time_per_iteration": 2.864786148071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094797, + "balance_loss_mlp": 1.0824883, + "diversity_loss_mlp": 0.0, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.06339794743033755, + "language_loss": 0.86393988, + "learning_rate": 0.0006805695664925629, + "loss": 0.87488782, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2084, + "time_per_iteration": 2.844709634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089155, + "balance_loss_mlp": 1.07735372, + "diversity_loss_mlp": 0.0, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.0888076684038974, + "language_loss": 0.83841193, + "learning_rate": 0.0006802790145344506, + "loss": 0.84930348, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2085, + "time_per_iteration": 2.4883856773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083386, + "balance_loss_mlp": 1.07145894, + "diversity_loss_mlp": 0.0, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07803386161895243, + "language_loss": 0.87420845, + "learning_rate": 0.0006799883925858176, + "loss": 0.88504231, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2086, + "time_per_iteration": 2.8824286460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088787, + "balance_loss_mlp": 1.0766871, + "diversity_loss_mlp": 0.0, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06924310288687491, + "language_loss": 0.85459089, + "learning_rate": 0.0006796977007594933, + "loss": 0.86547881, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2087, + "time_per_iteration": 2.6597371101379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970559, + "balance_loss_mlp": 1.6983223, + "diversity_loss_mlp": 0.21244028, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.03280700890509502, + "language_loss": 0.86715519, + "learning_rate": 0.0006794069391683345, + "loss": 0.87686074, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01517779, + "step": 2088, + "time_per_iteration": 2.7649624347686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078316, + "balance_loss_mlp": 1.06610286, + "diversity_loss_mlp": 0.0, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07764554073270104, + "language_loss": 0.80781567, + "learning_rate": 0.0006791161079252248, + "loss": 0.81859887, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2089, + "time_per_iteration": 2.6467885971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082706, + "balance_loss_mlp": 1.07014716, + "diversity_loss_mlp": 0.0, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.0935978018434956, + "language_loss": 0.82482743, + "learning_rate": 0.0006788252071430747, + "loss": 0.8356545, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.12561035, + "routerloss_mlp": 0.0, + "step": 2090, + "time_per_iteration": 2.684659242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076621, + "balance_loss_mlp": 1.06417561, + "diversity_loss_mlp": 0.0, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.061003649340911806, + "language_loss": 0.86884034, + "learning_rate": 0.0006785342369348222, + "loss": 0.87960654, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 2091, + "time_per_iteration": 2.7500762939453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081239, + "balance_loss_mlp": 1.06896663, + "diversity_loss_mlp": 0.0, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08323404973511926, + "language_loss": 0.79681003, + "learning_rate": 0.0006782431974134316, + "loss": 0.80762231, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2092, + "time_per_iteration": 2.554500102996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.07266974, + "diversity_loss_mlp": 0.0, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.06323665884579813, + "language_loss": 0.89339125, + "learning_rate": 0.0006779520886918949, + "loss": 0.90424317, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2093, + "time_per_iteration": 3.0625791549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109256, + "balance_loss_mlp": 1.08038247, + "diversity_loss_mlp": 0.0, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.06591278584355922, + "language_loss": 0.81594688, + "learning_rate": 0.0006776609108832301, + "loss": 0.82687247, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2094, + "time_per_iteration": 2.84006929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099273, + "balance_loss_mlp": 1.08723903, + "diversity_loss_mlp": 0.0, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.07397134749055344, + "language_loss": 0.84911013, + "learning_rate": 0.0006773696641004828, + "loss": 0.86010277, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2095, + "time_per_iteration": 2.5662059783935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110289, + "balance_loss_mlp": 1.09781969, + "diversity_loss_mlp": 0.0, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.07471072764212172, + "language_loss": 0.77422667, + "learning_rate": 0.0006770783484567247, + "loss": 0.78532958, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2096, + "time_per_iteration": 3.120000123977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106409, + "balance_loss_mlp": 1.09445786, + "diversity_loss_mlp": 0.0, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.05645154934481913, + "language_loss": 0.85885596, + "learning_rate": 0.000676786964065055, + "loss": 0.86992002, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2097, + "time_per_iteration": 2.7947449684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09767413, + "diversity_loss_mlp": 0.0, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.06468702094514471, + "language_loss": 0.78823644, + "learning_rate": 0.0006764955110385986, + "loss": 0.7993331, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2098, + "time_per_iteration": 2.7805027961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113539, + "balance_loss_mlp": 1.10162365, + "diversity_loss_mlp": 0.0, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06520165677387538, + "language_loss": 0.80479109, + "learning_rate": 0.0006762039894905083, + "loss": 0.81592649, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2099, + "time_per_iteration": 2.5934462547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113864, + "balance_loss_mlp": 1.10191941, + "diversity_loss_mlp": 0.0, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.07619139256642768, + "language_loss": 0.80502266, + "learning_rate": 0.000675912399533962, + "loss": 0.81616127, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2100, + "time_per_iteration": 2.5193917751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0095878, + "balance_loss_mlp": 1.67460704, + "diversity_loss_mlp": 0.21229821, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.026749352452392162, + "language_loss": 0.8501215, + "learning_rate": 0.0006756207412821656, + "loss": 0.85970926, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532745, + "step": 2101, + "time_per_iteration": 3.0674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125351, + "balance_loss_mlp": 1.11366224, + "diversity_loss_mlp": 0.0, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.07971707112625441, + "language_loss": 0.80680853, + "learning_rate": 0.0006753290148483505, + "loss": 0.81806201, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2102, + "time_per_iteration": 3.0177412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128591, + "balance_loss_mlp": 1.11720061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07197972569419236, + "language_loss": 0.78862077, + "learning_rate": 0.0006750372203457752, + "loss": 0.79990667, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2103, + "time_per_iteration": 2.4715232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133472, + "balance_loss_mlp": 1.12199795, + "diversity_loss_mlp": 0.0, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.05679089538273026, + "language_loss": 0.8629868, + "learning_rate": 0.0006747453578877242, + "loss": 0.87432158, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2104, + "time_per_iteration": 2.7127907276153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133404, + "balance_loss_mlp": 1.12154305, + "diversity_loss_mlp": 0.0, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.07881786572134404, + "language_loss": 0.83325595, + "learning_rate": 0.0006744534275875085, + "loss": 0.84459001, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2105, + "time_per_iteration": 2.9968934059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_mlp": 1.11278331, + "diversity_loss_mlp": 0.0, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.06959652480101333, + "language_loss": 0.85228348, + "learning_rate": 0.0006741614295584657, + "loss": 0.86352497, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2106, + "time_per_iteration": 2.6837310791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128388, + "balance_loss_mlp": 1.1166873, + "diversity_loss_mlp": 0.0, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.07271017039443997, + "language_loss": 0.78820735, + "learning_rate": 0.0006738693639139595, + "loss": 0.79949123, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2107, + "time_per_iteration": 2.9876344203948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.09982085, + "diversity_loss_mlp": 0.0, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.07545270814647756, + "language_loss": 0.7770499, + "learning_rate": 0.0006735772307673796, + "loss": 0.78816462, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2108, + "time_per_iteration": 3.5391368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.1007216, + "diversity_loss_mlp": 0.0, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.07028810729839409, + "language_loss": 0.8317976, + "learning_rate": 0.0006732850302321421, + "loss": 0.84292281, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2109, + "time_per_iteration": 2.924703359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107897, + "balance_loss_mlp": 1.0962801, + "diversity_loss_mlp": 0.0, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.08331494403878895, + "language_loss": 0.84220135, + "learning_rate": 0.00067299276242169, + "loss": 0.85328031, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2110, + "time_per_iteration": 2.6628758907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00591895, + "balance_loss_mlp": 1.01285744, + "diversity_loss_mlp": 0.15005666, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.0011574932258311419, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.74974066, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01043818, + "step": 2111, + "time_per_iteration": 4.913798093795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100112, + "balance_loss_mlp": 1.0884769, + "diversity_loss_mlp": 0.0, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.0671840972805921, + "language_loss": 0.77974957, + "learning_rate": 0.0006724080254290395, + "loss": 0.79075068, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2112, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087298, + "balance_loss_mlp": 1.07509685, + "diversity_loss_mlp": 0.0, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.06921545909042545, + "language_loss": 0.89956391, + "learning_rate": 0.0006721155564738566, + "loss": 0.91043687, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2113, + "time_per_iteration": 2.654052495956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590146, + "balance_loss_mlp": 1.01069736, + "diversity_loss_mlp": 0.14874323, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.001129022163549877, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79212785, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01042587, + "step": 2114, + "time_per_iteration": 5.02890682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095601, + "balance_loss_mlp": 1.08348942, + "diversity_loss_mlp": 0.0, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.06673632265299649, + "language_loss": 0.85678279, + "learning_rate": 0.0006715304182135078, + "loss": 0.86773884, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2115, + "time_per_iteration": 2.6665151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.07951176, + "diversity_loss_mlp": 0.0, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.08902530655488881, + "language_loss": 0.8859638, + "learning_rate": 0.0006712377491355127, + "loss": 0.89688623, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 2116, + "time_per_iteration": 2.9124083518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091485, + "balance_loss_mlp": 1.07896256, + "diversity_loss_mlp": 0.0, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.06275972542298792, + "language_loss": 0.81009984, + "learning_rate": 0.0006709450135771274, + "loss": 0.8210147, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2117, + "time_per_iteration": 2.9538469314575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109045, + "balance_loss_mlp": 1.07800436, + "diversity_loss_mlp": 0.0, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.06731197780562713, + "language_loss": 0.8655895, + "learning_rate": 0.0006706522116520023, + "loss": 0.87649393, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2118, + "time_per_iteration": 2.6403684616088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.08127189, + "diversity_loss_mlp": 0.0, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.07339707473672348, + "language_loss": 0.82936597, + "learning_rate": 0.0006703593434738127, + "loss": 0.84030455, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2119, + "time_per_iteration": 2.706406354904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_mlp": 1.0847466, + "diversity_loss_mlp": 0.0, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.05750096894007485, + "language_loss": 0.78123623, + "learning_rate": 0.0006700664091562604, + "loss": 0.79220533, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2120, + "time_per_iteration": 2.5515992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102687, + "balance_loss_mlp": 1.09045601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.08484846499370094, + "language_loss": 0.85241771, + "learning_rate": 0.0006697734088130725, + "loss": 0.86344457, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2121, + "time_per_iteration": 2.5997116565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094995, + "balance_loss_mlp": 1.08268619, + "diversity_loss_mlp": 0.0, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.06901349076849703, + "language_loss": 0.85628182, + "learning_rate": 0.0006694803425580018, + "loss": 0.86723173, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2122, + "time_per_iteration": 2.975572109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090381, + "balance_loss_mlp": 1.07825708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08123936309079019, + "language_loss": 0.84420574, + "learning_rate": 0.0006691872105048268, + "loss": 0.85510951, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2123, + "time_per_iteration": 2.5785253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109138, + "balance_loss_mlp": 1.07879114, + "diversity_loss_mlp": 0.0, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.06700388653835253, + "language_loss": 0.84703517, + "learning_rate": 0.0006688940127673513, + "loss": 0.85794896, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 2124, + "time_per_iteration": 2.794312000274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080411, + "balance_loss_mlp": 1.06789398, + "diversity_loss_mlp": 0.0, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.11477925500015464, + "language_loss": 0.85646629, + "learning_rate": 0.0006686007494594049, + "loss": 0.86727041, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2125, + "time_per_iteration": 2.8629977703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.06869102, + "diversity_loss_mlp": 0.0, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.08770785423003769, + "language_loss": 0.80226219, + "learning_rate": 0.0006683074206948425, + "loss": 0.81306815, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2126, + "time_per_iteration": 2.5477960109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.06884146, + "diversity_loss_mlp": 0.0, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.0688791895715759, + "language_loss": 0.81257784, + "learning_rate": 0.0006680140265875443, + "loss": 0.82338405, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2127, + "time_per_iteration": 2.824706792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.06504989, + "diversity_loss_mlp": 0.0, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.0706270365820259, + "language_loss": 0.95744675, + "learning_rate": 0.0006677205672514162, + "loss": 0.96821618, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2128, + "time_per_iteration": 2.6173171997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081253, + "balance_loss_mlp": 1.06944525, + "diversity_loss_mlp": 0.0, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.08385407721227026, + "language_loss": 0.88751161, + "learning_rate": 0.000667427042800389, + "loss": 0.89832413, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2129, + "time_per_iteration": 2.746561288833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090066, + "balance_loss_mlp": 1.07828188, + "diversity_loss_mlp": 0.0, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.0802302808929841, + "language_loss": 0.82728851, + "learning_rate": 0.0006671334533484192, + "loss": 0.83818918, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2130, + "time_per_iteration": 2.7765390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094937, + "balance_loss_mlp": 1.08306408, + "diversity_loss_mlp": 0.0, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.06494454218377498, + "language_loss": 0.83394802, + "learning_rate": 0.0006668397990094881, + "loss": 0.84489739, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2131, + "time_per_iteration": 2.6814444065093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094306, + "balance_loss_mlp": 1.08240891, + "diversity_loss_mlp": 0.0, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08851492372685672, + "language_loss": 0.84863144, + "learning_rate": 0.0006665460798976027, + "loss": 0.8595745, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2132, + "time_per_iteration": 2.734208822250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.08680749, + "diversity_loss_mlp": 0.0, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.07834997970618658, + "language_loss": 0.8153789, + "learning_rate": 0.0006662522961267947, + "loss": 0.82635975, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2133, + "time_per_iteration": 2.642789363861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_mlp": 1.0889008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.06175420460070233, + "language_loss": 0.87238759, + "learning_rate": 0.0006659584478111211, + "loss": 0.88339174, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2134, + "time_per_iteration": 2.8097283840179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110764, + "balance_loss_mlp": 1.09618366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.07261990262121029, + "language_loss": 0.82762325, + "learning_rate": 0.000665664535064664, + "loss": 0.83869964, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2135, + "time_per_iteration": 3.034973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118337, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07277612177905571, + "language_loss": 0.82753229, + "learning_rate": 0.0006653705580015303, + "loss": 0.83871567, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2136, + "time_per_iteration": 2.719024181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130623, + "balance_loss_mlp": 1.11913705, + "diversity_loss_mlp": 0.0, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.09561286081072368, + "language_loss": 0.86333638, + "learning_rate": 0.0006650765167358523, + "loss": 0.87464261, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2137, + "time_per_iteration": 2.798013210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119744, + "balance_loss_mlp": 1.10816908, + "diversity_loss_mlp": 0.0, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06575385598885217, + "language_loss": 0.90120316, + "learning_rate": 0.0006647824113817864, + "loss": 0.9124006, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2138, + "time_per_iteration": 2.5290029048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00941862, + "balance_loss_mlp": 1.64172852, + "diversity_loss_mlp": 0.21382158, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.027199696320483784, + "language_loss": 0.81782889, + "learning_rate": 0.000664488242053515, + "loss": 0.8272475, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01408678, + "step": 2139, + "time_per_iteration": 2.7610864639282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_mlp": 1.1009748, + "diversity_loss_mlp": 0.0, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.07795493316399416, + "language_loss": 0.83879304, + "learning_rate": 0.0006641940088652445, + "loss": 0.84992164, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2140, + "time_per_iteration": 2.7797446250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.08682573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09321248474614077, + "language_loss": 0.82214057, + "learning_rate": 0.0006638997119312065, + "loss": 0.83312857, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2141, + "time_per_iteration": 2.688427209854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_mlp": 1.07580638, + "diversity_loss_mlp": 0.0, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.05051376163622262, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76146024, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2142, + "time_per_iteration": 4.916438817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_mlp": 1.07186329, + "diversity_loss_mlp": 0.0, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.0666522569579182, + "language_loss": 0.84487629, + "learning_rate": 0.000663310927282877, + "loss": 0.85571963, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2143, + "time_per_iteration": 2.742781162261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.06302905, + "diversity_loss_mlp": 0.0, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07553146792883669, + "language_loss": 0.85816187, + "learning_rate": 0.000663016439797172, + "loss": 0.86891896, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2144, + "time_per_iteration": 2.602322578430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.06363273, + "diversity_loss_mlp": 0.0, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.09188682549299809, + "language_loss": 0.80924189, + "learning_rate": 0.0006627218890228724, + "loss": 0.82000041, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2145, + "time_per_iteration": 2.76790452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081048, + "balance_loss_mlp": 1.0687809, + "diversity_loss_mlp": 0.0, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.09235653357512275, + "language_loss": 0.83860421, + "learning_rate": 0.0006624272750743326, + "loss": 0.84941471, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2146, + "time_per_iteration": 2.986267566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085622, + "balance_loss_mlp": 1.073385, + "diversity_loss_mlp": 0.0, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06221373460159241, + "language_loss": 0.82866907, + "learning_rate": 0.0006621325980659322, + "loss": 0.83952528, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2147, + "time_per_iteration": 2.78074049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091906, + "balance_loss_mlp": 1.07981253, + "diversity_loss_mlp": 0.0, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06655163113776748, + "language_loss": 0.81613219, + "learning_rate": 0.000661837858112075, + "loss": 0.82705128, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2148, + "time_per_iteration": 2.8118457794189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920817, + "balance_loss_mlp": 1.59947157, + "diversity_loss_mlp": 0.21162269, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.03430222900415099, + "language_loss": 0.88696158, + "learning_rate": 0.0006615430553271888, + "loss": 0.89616972, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01526995, + "step": 2149, + "time_per_iteration": 2.809389352798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10438299, + "diversity_loss_mlp": 0.0, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06824786639125466, + "language_loss": 0.85333586, + "learning_rate": 0.0006612481898257264, + "loss": 0.8644954, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2150, + "time_per_iteration": 2.855074644088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137757, + "balance_loss_mlp": 1.12599659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.07789693292988349, + "language_loss": 0.851385, + "learning_rate": 0.000660953261722165, + "loss": 0.86276257, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.11749268, + "routerloss_mlp": 0.0, + "step": 2151, + "time_per_iteration": 2.5938022136688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113704, + "balance_loss_mlp": 1.12522054, + "diversity_loss_mlp": 0.0, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.08228338378299185, + "language_loss": 0.82884097, + "learning_rate": 0.0006606582711310055, + "loss": 0.84021133, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2152, + "time_per_iteration": 2.7282497882843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145368, + "balance_loss_mlp": 1.13366747, + "diversity_loss_mlp": 0.0, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.06559194318793425, + "language_loss": 0.82812124, + "learning_rate": 0.0006603632181667736, + "loss": 0.83957493, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2153, + "time_per_iteration": 2.6664750576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_mlp": 1.09754133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03767833543400207, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8004716, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 2154, + "time_per_iteration": 4.910309791564941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_mlp": 1.12367392, + "diversity_loss_mlp": 0.0, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.0807614788835298, + "language_loss": 0.81897664, + "learning_rate": 0.0006597729255773153, + "loss": 0.83032906, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2155, + "time_per_iteration": 2.509021520614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.13441765, + "diversity_loss_mlp": 0.0, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.07993173196210833, + "language_loss": 0.82465029, + "learning_rate": 0.0006594776861812608, + "loss": 0.83611095, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2156, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151315, + "balance_loss_mlp": 1.13991857, + "diversity_loss_mlp": 0.0, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06494614409867079, + "language_loss": 0.8654387, + "learning_rate": 0.0006591823848704776, + "loss": 0.87695187, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.11395264, + "routerloss_mlp": 0.0, + "step": 2157, + "time_per_iteration": 2.9039251804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134696, + "balance_loss_mlp": 1.12316287, + "diversity_loss_mlp": 0.0, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07584878913150254, + "language_loss": 0.81510401, + "learning_rate": 0.0006588870217596117, + "loss": 0.82645094, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2158, + "time_per_iteration": 2.7366249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121714, + "balance_loss_mlp": 1.11010289, + "diversity_loss_mlp": 0.0, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.0768974217493938, + "language_loss": 0.8567549, + "learning_rate": 0.0006585915969633334, + "loss": 0.86797202, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2159, + "time_per_iteration": 2.557969331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105923, + "balance_loss_mlp": 1.09437764, + "diversity_loss_mlp": 0.0, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06453825749462137, + "language_loss": 0.89545041, + "learning_rate": 0.0006582961105963366, + "loss": 0.90650964, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2160, + "time_per_iteration": 2.782766103744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089959, + "balance_loss_mlp": 1.07836008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.09389311079563152, + "language_loss": 0.77639234, + "learning_rate": 0.0006580005627733395, + "loss": 0.78729188, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2161, + "time_per_iteration": 2.7049734592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_mlp": 1.07492197, + "diversity_loss_mlp": 0.0, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.08236412019602501, + "language_loss": 0.81618345, + "learning_rate": 0.0006577049536090838, + "loss": 0.8270492, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2162, + "time_per_iteration": 2.723243236541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078674, + "balance_loss_mlp": 1.06676459, + "diversity_loss_mlp": 0.0, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.09869721655750711, + "language_loss": 0.85591501, + "learning_rate": 0.000657409283218335, + "loss": 0.86670172, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2163, + "time_per_iteration": 2.64973783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078005, + "balance_loss_mlp": 1.0662148, + "diversity_loss_mlp": 0.0, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.06806079796586995, + "language_loss": 0.81014043, + "learning_rate": 0.0006571135517158829, + "loss": 0.82092047, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2164, + "time_per_iteration": 2.6662614345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261192, + "balance_loss_mlp": 1.25542271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.0963910676883023, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.78025252, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2165, + "time_per_iteration": 4.733267068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07227921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.08489426271121504, + "language_loss": 0.83098751, + "learning_rate": 0.0006565219058351444, + "loss": 0.84183216, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 2166, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07506573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.0663020588108057, + "language_loss": 0.82663929, + "learning_rate": 0.0006562259916865553, + "loss": 0.83751583, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.12585449, + "routerloss_mlp": 0.0, + "step": 2167, + "time_per_iteration": 2.5647947788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085528, + "balance_loss_mlp": 1.07305884, + "diversity_loss_mlp": 0.0, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.11811458423881586, + "language_loss": 0.79392177, + "learning_rate": 0.0006559300168856573, + "loss": 0.80477709, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2168, + "time_per_iteration": 2.737071990966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090803, + "balance_loss_mlp": 1.07860184, + "diversity_loss_mlp": 0.0, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07183663020795078, + "language_loss": 0.86060214, + "learning_rate": 0.0006556339815473577, + "loss": 0.87151015, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2169, + "time_per_iteration": 2.6506707668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087149, + "balance_loss_mlp": 1.07504892, + "diversity_loss_mlp": 0.0, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.07609133400056706, + "language_loss": 0.86409211, + "learning_rate": 0.000655337885786588, + "loss": 0.87496364, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2170, + "time_per_iteration": 2.8835949897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.06654263, + "diversity_loss_mlp": 0.0, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08298304012821277, + "language_loss": 0.85129267, + "learning_rate": 0.0006550417297183025, + "loss": 0.86207461, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2171, + "time_per_iteration": 2.6195385456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087717, + "balance_loss_mlp": 1.07584357, + "diversity_loss_mlp": 0.0, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07223590906341684, + "language_loss": 0.81395489, + "learning_rate": 0.0006547455134574793, + "loss": 0.82483202, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2172, + "time_per_iteration": 2.688387155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091071, + "balance_loss_mlp": 1.07947183, + "diversity_loss_mlp": 0.0, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06986640066350178, + "language_loss": 0.84520721, + "learning_rate": 0.0006544492371191198, + "loss": 0.85611784, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2173, + "time_per_iteration": 3.1099753379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094341, + "balance_loss_mlp": 1.08226562, + "diversity_loss_mlp": 0.0, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.06657472623207703, + "language_loss": 0.8341983, + "learning_rate": 0.0006541529008182485, + "loss": 0.84514177, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2174, + "time_per_iteration": 3.203376054763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09567666, + "diversity_loss_mlp": 0.0, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.07167092475387357, + "language_loss": 0.87561977, + "learning_rate": 0.0006538565046699136, + "loss": 0.8866933, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2175, + "time_per_iteration": 2.6136248111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122872, + "balance_loss_mlp": 1.1111474, + "diversity_loss_mlp": 0.0, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.08073018870716439, + "language_loss": 0.81308544, + "learning_rate": 0.0006535600487891862, + "loss": 0.82431418, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2176, + "time_per_iteration": 2.8484995365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112142, + "balance_loss_mlp": 1.10968423, + "diversity_loss_mlp": 0.0, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.06933020813080157, + "language_loss": 0.89047962, + "learning_rate": 0.0006532635332911603, + "loss": 0.90169382, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2177, + "time_per_iteration": 2.6983814239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.12828767, + "diversity_loss_mlp": 0.0, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.07833316419755533, + "language_loss": 0.80340332, + "learning_rate": 0.0006529669582909541, + "loss": 0.81480134, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2178, + "time_per_iteration": 3.247034788131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130167, + "balance_loss_mlp": 1.11881781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08850961832331757, + "language_loss": 0.85867965, + "learning_rate": 0.0006526703239037077, + "loss": 0.86998129, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2179, + "time_per_iteration": 2.6653683185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933718, + "balance_loss_mlp": 1.62844765, + "diversity_loss_mlp": 0.20954823, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.029582524443817385, + "language_loss": 0.86593473, + "learning_rate": 0.0006523736302445851, + "loss": 0.87527192, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01471971, + "step": 2180, + "time_per_iteration": 2.857030153274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120353, + "balance_loss_mlp": 1.10893881, + "diversity_loss_mlp": 0.0, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.0687803817541909, + "language_loss": 0.77392578, + "learning_rate": 0.0006520768774287728, + "loss": 0.78512931, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2181, + "time_per_iteration": 5.625683307647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114289, + "balance_loss_mlp": 1.10282135, + "diversity_loss_mlp": 0.0, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06088029266780351, + "language_loss": 0.85493296, + "learning_rate": 0.0006517800655714806, + "loss": 0.86607587, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2182, + "time_per_iteration": 2.812955617904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105622, + "balance_loss_mlp": 1.09442866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07098705372074567, + "language_loss": 0.85399854, + "learning_rate": 0.0006514831947879407, + "loss": 0.86505473, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.11193848, + "routerloss_mlp": 0.0, + "step": 2183, + "time_per_iteration": 2.961418867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08642888, + "diversity_loss_mlp": 0.0, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.08450852264083888, + "language_loss": 0.78323019, + "learning_rate": 0.0006511862651934091, + "loss": 0.79420632, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2184, + "time_per_iteration": 3.076414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091172, + "balance_loss_mlp": 1.07956707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06921087236063693, + "language_loss": 0.82092035, + "learning_rate": 0.0006508892769031638, + "loss": 0.83183205, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2185, + "time_per_iteration": 2.638606309890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089875, + "balance_loss_mlp": 1.07868707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.07895440454445611, + "language_loss": 0.87322706, + "learning_rate": 0.000650592230032506, + "loss": 0.88412583, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2186, + "time_per_iteration": 2.702061176300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.0815382, + "diversity_loss_mlp": 0.0, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07748698496632533, + "language_loss": 0.85121393, + "learning_rate": 0.0006502951246967595, + "loss": 0.8621465, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2187, + "time_per_iteration": 2.871629476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087261, + "balance_loss_mlp": 1.07582331, + "diversity_loss_mlp": 0.0, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.06016607527200091, + "language_loss": 0.86913472, + "learning_rate": 0.0006499979610112706, + "loss": 0.88000733, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2188, + "time_per_iteration": 2.795278787612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107962, + "balance_loss_mlp": 1.06803894, + "diversity_loss_mlp": 0.0, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.0593739697007924, + "language_loss": 0.84024572, + "learning_rate": 0.000649700739091409, + "loss": 0.85104191, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2189, + "time_per_iteration": 2.822756290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123251, + "balance_loss_mlp": 1.11500144, + "diversity_loss_mlp": 0.0, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03860831682793276, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74959522, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 2190, + "time_per_iteration": 4.79919958114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082066, + "balance_loss_mlp": 1.07052088, + "diversity_loss_mlp": 0.0, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06761793691364075, + "language_loss": 0.85737348, + "learning_rate": 0.0006491061210101557, + "loss": 0.86819422, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2191, + "time_per_iteration": 2.661578416824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094285, + "balance_loss_mlp": 1.08270931, + "diversity_loss_mlp": 0.0, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0725556462678514, + "language_loss": 0.83956218, + "learning_rate": 0.0006488087250796157, + "loss": 0.85050505, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2192, + "time_per_iteration": 2.881225347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095445, + "balance_loss_mlp": 1.08376861, + "diversity_loss_mlp": 0.0, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.09298126342392905, + "language_loss": 0.81662476, + "learning_rate": 0.0006485112713764049, + "loss": 0.82757914, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2193, + "time_per_iteration": 2.8921914100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.08214593, + "diversity_loss_mlp": 0.0, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.058244545196029895, + "language_loss": 0.83715278, + "learning_rate": 0.0006482137600160051, + "loss": 0.84809017, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2194, + "time_per_iteration": 2.484341859817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094196, + "balance_loss_mlp": 1.08240056, + "diversity_loss_mlp": 0.0, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.08574033239321836, + "language_loss": 0.847399, + "learning_rate": 0.0006479161911139206, + "loss": 0.85834098, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2195, + "time_per_iteration": 2.5937106609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082105, + "balance_loss_mlp": 1.07043433, + "diversity_loss_mlp": 0.0, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08791937036502419, + "language_loss": 0.85522735, + "learning_rate": 0.0006476185647856778, + "loss": 0.86604846, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2196, + "time_per_iteration": 2.569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.06815672, + "diversity_loss_mlp": 0.0, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.07778870715402122, + "language_loss": 0.82192588, + "learning_rate": 0.0006473208811468255, + "loss": 0.83272707, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2197, + "time_per_iteration": 2.899557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.06046605, + "diversity_loss_mlp": 0.0, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.07330307904629892, + "language_loss": 0.84140831, + "learning_rate": 0.0006470231403129347, + "loss": 0.85212964, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.11663818, + "routerloss_mlp": 0.0, + "step": 2198, + "time_per_iteration": 2.602447509765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106933, + "balance_loss_mlp": 1.05760026, + "diversity_loss_mlp": 0.0, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06409293690085444, + "language_loss": 0.81590885, + "learning_rate": 0.0006467253423995988, + "loss": 0.82660222, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2199, + "time_per_iteration": 2.8557229042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_mlp": 1.06755078, + "diversity_loss_mlp": 0.0, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.07244216805562081, + "language_loss": 0.78831869, + "learning_rate": 0.000646427487522433, + "loss": 0.79911208, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2200, + "time_per_iteration": 2.65742826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_mlp": 1.07336855, + "diversity_loss_mlp": 0.0, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.07121994515744344, + "language_loss": 0.83032513, + "learning_rate": 0.0006461295757970749, + "loss": 0.84117424, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2201, + "time_per_iteration": 2.950655698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090426, + "balance_loss_mlp": 1.07880902, + "diversity_loss_mlp": 0.0, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.07713064950594434, + "language_loss": 0.81538546, + "learning_rate": 0.0006458316073391839, + "loss": 0.82628965, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2202, + "time_per_iteration": 2.8609914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089677, + "balance_loss_mlp": 1.07874584, + "diversity_loss_mlp": 0.0, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.07022827859020209, + "language_loss": 0.87709206, + "learning_rate": 0.0006455335822644422, + "loss": 0.88798881, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2203, + "time_per_iteration": 2.6978323459625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118526, + "balance_loss_mlp": 1.10743332, + "diversity_loss_mlp": 0.0, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.08724206882012846, + "language_loss": 0.78530163, + "learning_rate": 0.0006452355006885527, + "loss": 0.79648691, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2204, + "time_per_iteration": 2.686579704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00922718, + "balance_loss_mlp": 1.60671031, + "diversity_loss_mlp": 0.20807257, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.038668439213979985, + "language_loss": 0.8761735, + "learning_rate": 0.0006449373627272412, + "loss": 0.88540065, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532654, + "step": 2205, + "time_per_iteration": 2.7558722496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112883, + "balance_loss_mlp": 1.10164738, + "diversity_loss_mlp": 0.0, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08032286277613819, + "language_loss": 0.82142913, + "learning_rate": 0.0006446391684962553, + "loss": 0.83255792, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2206, + "time_per_iteration": 2.6579248905181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117757, + "balance_loss_mlp": 1.10650921, + "diversity_loss_mlp": 0.0, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.06707307211931093, + "language_loss": 0.82899106, + "learning_rate": 0.000644340918111364, + "loss": 0.8401686, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2207, + "time_per_iteration": 2.5347208976745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117145, + "balance_loss_mlp": 1.10573626, + "diversity_loss_mlp": 0.0, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.09153331321335235, + "language_loss": 0.84820396, + "learning_rate": 0.0006440426116883585, + "loss": 0.85937536, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2208, + "time_per_iteration": 2.5513036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.11258864, + "diversity_loss_mlp": 0.0, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.07442494649717855, + "language_loss": 0.86227304, + "learning_rate": 0.0006437442493430519, + "loss": 0.87351412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2209, + "time_per_iteration": 2.6560840606689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120019, + "balance_loss_mlp": 1.10829473, + "diversity_loss_mlp": 0.0, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.09545289030190586, + "language_loss": 0.86441422, + "learning_rate": 0.000643445831191278, + "loss": 0.8756144, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2210, + "time_per_iteration": 2.9028308391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.09162724, + "diversity_loss_mlp": 0.0, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.07646392549286844, + "language_loss": 0.81526744, + "learning_rate": 0.0006431473573488937, + "loss": 0.82629919, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2211, + "time_per_iteration": 2.7377443313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089807, + "balance_loss_mlp": 1.0782795, + "diversity_loss_mlp": 0.0, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.08107145257136338, + "language_loss": 0.85147351, + "learning_rate": 0.0006428488279317765, + "loss": 0.86237156, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2212, + "time_per_iteration": 2.6276626586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.08065951, + "diversity_loss_mlp": 0.0, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.09124161172132733, + "language_loss": 0.87490094, + "learning_rate": 0.0006425502430558259, + "loss": 0.88581866, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.11120605, + "routerloss_mlp": 0.0, + "step": 2213, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.08046961, + "diversity_loss_mlp": 0.0, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.06865062693642494, + "language_loss": 0.84588826, + "learning_rate": 0.0006422516028369628, + "loss": 0.85680431, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.11138916, + "routerloss_mlp": 0.0, + "step": 2214, + "time_per_iteration": 2.639619827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085781, + "balance_loss_mlp": 1.07456374, + "diversity_loss_mlp": 0.0, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.06481575152476399, + "language_loss": 0.83497036, + "learning_rate": 0.0006419529073911296, + "loss": 0.84582818, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2215, + "time_per_iteration": 2.8564555644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091551, + "balance_loss_mlp": 1.08075058, + "diversity_loss_mlp": 0.0, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.07537518077633425, + "language_loss": 0.85102242, + "learning_rate": 0.0006416541568342901, + "loss": 0.86193788, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2216, + "time_per_iteration": 2.8998327255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082292, + "balance_loss_mlp": 1.07092535, + "diversity_loss_mlp": 0.0, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.06331803259599181, + "language_loss": 0.84347832, + "learning_rate": 0.0006413553512824297, + "loss": 0.85430121, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2217, + "time_per_iteration": 2.754044532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_mlp": 1.07307625, + "diversity_loss_mlp": 0.0, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07616444203019798, + "language_loss": 0.84374213, + "learning_rate": 0.0006410564908515549, + "loss": 0.85458404, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2218, + "time_per_iteration": 2.724478006362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06966138, + "diversity_loss_mlp": 0.0, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.0731173396075932, + "language_loss": 0.85161233, + "learning_rate": 0.0006407575756576935, + "loss": 0.86242241, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2219, + "time_per_iteration": 2.754624128341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093699, + "balance_loss_mlp": 1.08191478, + "diversity_loss_mlp": 0.0, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.068521011535794, + "language_loss": 0.87612599, + "learning_rate": 0.0006404586058168951, + "loss": 0.88706297, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2220, + "time_per_iteration": 2.6972298622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100965, + "balance_loss_mlp": 1.08927631, + "diversity_loss_mlp": 0.0, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.1033551804820373, + "language_loss": 0.86327708, + "learning_rate": 0.0006401595814452296, + "loss": 0.87428677, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2221, + "time_per_iteration": 2.6071925163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.08816695, + "diversity_loss_mlp": 0.0, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07649462730323824, + "language_loss": 0.8070569, + "learning_rate": 0.000639860502658789, + "loss": 0.81805706, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2222, + "time_per_iteration": 2.6844141483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101843, + "balance_loss_mlp": 1.08965993, + "diversity_loss_mlp": 0.0, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0652732350229211, + "language_loss": 0.84929889, + "learning_rate": 0.0006395613695736853, + "loss": 0.86031729, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2223, + "time_per_iteration": 2.6799042224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091013, + "balance_loss_mlp": 1.07850194, + "diversity_loss_mlp": 0.0, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.10552751254703834, + "language_loss": 0.82026577, + "learning_rate": 0.0006392621823060529, + "loss": 0.83117592, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 2224, + "time_per_iteration": 2.722675323486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083211, + "balance_loss_mlp": 1.07109332, + "diversity_loss_mlp": 0.0, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.0790777786133485, + "language_loss": 0.8508532, + "learning_rate": 0.0006389629409720465, + "loss": 0.86168534, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2225, + "time_per_iteration": 2.6559393405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084084, + "balance_loss_mlp": 1.07179379, + "diversity_loss_mlp": 0.0, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0811747132385773, + "language_loss": 0.88654399, + "learning_rate": 0.0006386636456878417, + "loss": 0.89738482, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2226, + "time_per_iteration": 2.898261308670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_mlp": 1.07153535, + "diversity_loss_mlp": 0.0, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07696212536929578, + "language_loss": 0.92413348, + "learning_rate": 0.0006383642965696353, + "loss": 0.93497235, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2227, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932178, + "balance_loss_mlp": 1.62005818, + "diversity_loss_mlp": 0.21207821, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.033827312051000154, + "language_loss": 0.83018744, + "learning_rate": 0.000638064893733645, + "loss": 0.83950925, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01611001, + "step": 2228, + "time_per_iteration": 2.74554705619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00939878, + "balance_loss_mlp": 1.63503206, + "diversity_loss_mlp": 0.21170495, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.03357304306136308, + "language_loss": 0.90087909, + "learning_rate": 0.000637765437296109, + "loss": 0.91027784, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01650969, + "step": 2229, + "time_per_iteration": 2.6807308197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086799, + "balance_loss_mlp": 1.07446718, + "diversity_loss_mlp": 0.0, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.09425394332621637, + "language_loss": 0.85585725, + "learning_rate": 0.000637465927373287, + "loss": 0.86672527, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2230, + "time_per_iteration": 2.6279454231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088133, + "balance_loss_mlp": 1.0761342, + "diversity_loss_mlp": 0.0, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.13300209785278838, + "language_loss": 0.79446864, + "learning_rate": 0.000637166364081459, + "loss": 0.80534995, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.11993408, + "routerloss_mlp": 0.0, + "step": 2231, + "time_per_iteration": 2.7252066135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108179, + "balance_loss_mlp": 1.07001245, + "diversity_loss_mlp": 0.0, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.08046243261781533, + "language_loss": 0.84081841, + "learning_rate": 0.0006368667475369256, + "loss": 0.85163629, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2232, + "time_per_iteration": 2.756286382675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03840148, + "diversity_loss_mlp": 0.0, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.02809293853716727, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79574001, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.078125, + "routerloss_mlp": 0.0, + "step": 2233, + "time_per_iteration": 4.852276086807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_mlp": 1.02313304, + "diversity_loss_mlp": 0.0, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.02329901381823612, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79926044, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2234, + "time_per_iteration": 4.812516689300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107186, + "balance_loss_mlp": 1.09534228, + "diversity_loss_mlp": 0.0, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.06628794940731256, + "language_loss": 0.86166692, + "learning_rate": 0.0006359675795504112, + "loss": 0.87273884, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2235, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112399, + "balance_loss_mlp": 1.11230159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08124483128316094, + "language_loss": 0.74637383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75761378, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2236, + "time_per_iteration": 3.51676082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138804, + "balance_loss_mlp": 1.12733603, + "diversity_loss_mlp": 0.0, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.08045247853644188, + "language_loss": 0.85975677, + "learning_rate": 0.0006353678700956511, + "loss": 0.87114477, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2237, + "time_per_iteration": 2.5487072467803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137661, + "balance_loss_mlp": 1.12605572, + "diversity_loss_mlp": 0.0, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.08414636037035166, + "language_loss": 0.84184766, + "learning_rate": 0.0006350679364783569, + "loss": 0.85322422, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2238, + "time_per_iteration": 2.730128288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113973, + "balance_loss_mlp": 1.1279577, + "diversity_loss_mlp": 0.0, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.06707032645836293, + "language_loss": 0.85872072, + "learning_rate": 0.0006347679504230393, + "loss": 0.87011802, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2239, + "time_per_iteration": 2.640791893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136631, + "balance_loss_mlp": 1.12453079, + "diversity_loss_mlp": 0.0, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07174503893432663, + "language_loss": 0.7626543, + "learning_rate": 0.0006344679120461632, + "loss": 0.77402061, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2240, + "time_per_iteration": 3.3352768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128316, + "balance_loss_mlp": 1.11687779, + "diversity_loss_mlp": 0.0, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.08647233478950261, + "language_loss": 0.79984182, + "learning_rate": 0.0006341678214642134, + "loss": 0.81112498, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2241, + "time_per_iteration": 2.662132740020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114748, + "balance_loss_mlp": 1.10336995, + "diversity_loss_mlp": 0.0, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06482352137494116, + "language_loss": 0.82986903, + "learning_rate": 0.0006338676787936963, + "loss": 0.84101653, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2242, + "time_per_iteration": 3.064518451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123318, + "balance_loss_mlp": 1.11183178, + "diversity_loss_mlp": 0.0, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.07554467546841755, + "language_loss": 0.84015846, + "learning_rate": 0.0006335674841511367, + "loss": 0.85139167, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2243, + "time_per_iteration": 2.7494354248046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067953, + "balance_loss_mlp": 1.06189752, + "diversity_loss_mlp": 0.0, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.020266409588932003, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80249119, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2244, + "time_per_iteration": 5.019898414611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_mlp": 1.05208015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.017496917907237546, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78423691, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2245, + "time_per_iteration": 4.940483808517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111091, + "balance_loss_mlp": 1.09893775, + "diversity_loss_mlp": 0.0, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.07826437205196314, + "language_loss": 0.82487583, + "learning_rate": 0.0006326665895567652, + "loss": 0.83598673, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2246, + "time_per_iteration": 2.6287152767181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_mlp": 1.09895015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.09268036537549412, + "language_loss": 0.87613881, + "learning_rate": 0.0006323661881916976, + "loss": 0.88725001, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2247, + "time_per_iteration": 2.6966464519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_mlp": 1.08901072, + "diversity_loss_mlp": 0.0, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.07850654458656253, + "language_loss": 0.812437, + "learning_rate": 0.0006320657354375179, + "loss": 0.82344878, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2248, + "time_per_iteration": 3.0057384967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.08872366, + "diversity_loss_mlp": 0.0, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.07399569527983862, + "language_loss": 0.87203169, + "learning_rate": 0.0006317652314108726, + "loss": 0.88303995, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2249, + "time_per_iteration": 2.6106557846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083093, + "balance_loss_mlp": 1.07126176, + "diversity_loss_mlp": 0.0, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07131076511794647, + "language_loss": 0.91191232, + "learning_rate": 0.0006314646762284277, + "loss": 0.92274326, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2250, + "time_per_iteration": 2.601017951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_mlp": 1.02617049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.02997957544407836, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76458681, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2251, + "time_per_iteration": 4.872025966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_mlp": 1.07351613, + "diversity_loss_mlp": 0.0, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07162967916255573, + "language_loss": 0.77412337, + "learning_rate": 0.0006308634128629022, + "loss": 0.78497767, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2252, + "time_per_iteration": 2.858896255493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089815, + "balance_loss_mlp": 1.07750654, + "diversity_loss_mlp": 0.0, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0655401202696214, + "language_loss": 0.8742274, + "learning_rate": 0.0006305627049132531, + "loss": 0.88512552, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2253, + "time_per_iteration": 2.8089702129364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_mlp": 1.07309866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05577202062379855, + "language_loss": 0.85968709, + "learning_rate": 0.0006302619462746662, + "loss": 0.87054229, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2254, + "time_per_iteration": 3.117469072341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090126, + "balance_loss_mlp": 1.07842588, + "diversity_loss_mlp": 0.0, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.07095559842956704, + "language_loss": 0.90230805, + "learning_rate": 0.0006299611370639069, + "loss": 0.91320932, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2255, + "time_per_iteration": 2.723188638687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084284, + "balance_loss_mlp": 1.07239318, + "diversity_loss_mlp": 0.0, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07367301477096526, + "language_loss": 0.79524988, + "learning_rate": 0.0006296602773977593, + "loss": 0.80609274, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2256, + "time_per_iteration": 2.6743130683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099708, + "balance_loss_mlp": 1.08790588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06301035546935001, + "language_loss": 0.87406039, + "learning_rate": 0.0006293593673930277, + "loss": 0.88505745, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2257, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103769, + "balance_loss_mlp": 1.09211683, + "diversity_loss_mlp": 0.0, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07716264473653381, + "language_loss": 0.78774142, + "learning_rate": 0.0006290584071665358, + "loss": 0.79877913, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2258, + "time_per_iteration": 2.9148640632629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.07634544, + "diversity_loss_mlp": 0.0, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06859255861010008, + "language_loss": 0.82309216, + "learning_rate": 0.0006287573968351266, + "loss": 0.83397484, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2259, + "time_per_iteration": 2.582099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081836, + "balance_loss_mlp": 1.06989694, + "diversity_loss_mlp": 0.0, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.0728512329620832, + "language_loss": 0.8210361, + "learning_rate": 0.0006284563365156626, + "loss": 0.83185446, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2260, + "time_per_iteration": 2.802004814147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_mlp": 1.06343079, + "diversity_loss_mlp": 0.0, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.08318375282180102, + "language_loss": 0.87862843, + "learning_rate": 0.0006281552263250261, + "loss": 0.88938093, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2261, + "time_per_iteration": 2.5335495471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_mlp": 1.02721453, + "diversity_loss_mlp": 0.0, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02511862566194507, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81726044, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 2262, + "time_per_iteration": 4.858395338058472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05593562, + "diversity_loss_mlp": 0.0, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.07030760098393707, + "language_loss": 0.81181604, + "learning_rate": 0.0006275528567978593, + "loss": 0.82249182, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2263, + "time_per_iteration": 2.9562113285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.05570674, + "diversity_loss_mlp": 0.0, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.09515047383985015, + "language_loss": 0.82464182, + "learning_rate": 0.0006272515976951898, + "loss": 0.83531702, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2264, + "time_per_iteration": 3.0750486850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106778, + "balance_loss_mlp": 1.05625236, + "diversity_loss_mlp": 0.0, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06538835415995116, + "language_loss": 0.7903443, + "learning_rate": 0.0006269502891890687, + "loss": 0.80102211, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2265, + "time_per_iteration": 3.0723042488098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069278, + "balance_loss_mlp": 1.05721438, + "diversity_loss_mlp": 0.0, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.06791130510000161, + "language_loss": 0.88071477, + "learning_rate": 0.0006266489313964743, + "loss": 0.89140749, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2266, + "time_per_iteration": 2.7362618446350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00937641, + "balance_loss_mlp": 1.63294578, + "diversity_loss_mlp": 0.21328503, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.028233172977391998, + "language_loss": 0.85207379, + "learning_rate": 0.0006263475244344041, + "loss": 0.8614502, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01452552, + "step": 2267, + "time_per_iteration": 2.8842954635620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082065, + "balance_loss_mlp": 1.06979251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.07502115173737808, + "language_loss": 0.84271002, + "learning_rate": 0.0006260460684198746, + "loss": 0.8535307, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2268, + "time_per_iteration": 2.6355533599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.07749879, + "diversity_loss_mlp": 0.0, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07640014386484298, + "language_loss": 0.84040511, + "learning_rate": 0.0006257445634699213, + "loss": 0.85130346, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.12322998, + "routerloss_mlp": 0.0, + "step": 2269, + "time_per_iteration": 2.5279150009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089112, + "balance_loss_mlp": 1.07683921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.16142331523875347, + "language_loss": 0.83037758, + "learning_rate": 0.0006254430097015993, + "loss": 0.84126872, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2270, + "time_per_iteration": 2.660228729248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_mlp": 1.03087568, + "diversity_loss_mlp": 0.0, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.024589935077845904, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77516735, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2271, + "time_per_iteration": 4.794579744338989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070138, + "balance_loss_mlp": 1.05796623, + "diversity_loss_mlp": 0.0, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.057648382072647573, + "language_loss": 0.85053569, + "learning_rate": 0.0006248397561781609, + "loss": 0.86123705, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2272, + "time_per_iteration": 2.862569570541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067836, + "balance_loss_mlp": 1.05557537, + "diversity_loss_mlp": 0.0, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08840424380788836, + "language_loss": 0.86255217, + "learning_rate": 0.0006245380566572482, + "loss": 0.87323052, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2273, + "time_per_iteration": 2.7386484146118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.0566572, + "diversity_loss_mlp": 0.0, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07723857249852564, + "language_loss": 0.75794655, + "learning_rate": 0.0006242363087863744, + "loss": 0.76863599, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2274, + "time_per_iteration": 2.948030710220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.05560887, + "diversity_loss_mlp": 0.0, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06687985923679116, + "language_loss": 0.86043644, + "learning_rate": 0.0006239345126826878, + "loss": 0.87111151, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2275, + "time_per_iteration": 2.787750482559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071289, + "balance_loss_mlp": 1.05926108, + "diversity_loss_mlp": 0.0, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07503499995760528, + "language_loss": 0.83946115, + "learning_rate": 0.0006236326684633561, + "loss": 0.85017407, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2276, + "time_per_iteration": 2.8109841346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.05921769, + "diversity_loss_mlp": 0.0, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.08049471875944368, + "language_loss": 0.75253642, + "learning_rate": 0.0006233307762455658, + "loss": 0.76324785, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2277, + "time_per_iteration": 2.632291793823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_mlp": 1.06043518, + "diversity_loss_mlp": 0.0, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.0727539933311737, + "language_loss": 0.83312476, + "learning_rate": 0.0006230288361465216, + "loss": 0.8438465, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2278, + "time_per_iteration": 3.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106943, + "balance_loss_mlp": 1.05752659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.08745359184854619, + "language_loss": 0.84888816, + "learning_rate": 0.0006227268482834473, + "loss": 0.85958248, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2279, + "time_per_iteration": 2.9116861820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929134, + "balance_loss_mlp": 1.61467147, + "diversity_loss_mlp": 0.21327347, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.03053717197724305, + "language_loss": 0.8733198, + "learning_rate": 0.000622424812773585, + "loss": 0.88261116, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0151619, + "step": 2280, + "time_per_iteration": 2.83655047416687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087033, + "balance_loss_mlp": 1.07515955, + "diversity_loss_mlp": 0.0, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.09030781332224262, + "language_loss": 0.8003484, + "learning_rate": 0.000622122729734195, + "loss": 0.81121874, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2281, + "time_per_iteration": 2.598515033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088881, + "balance_loss_mlp": 1.07746708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.05965815533468205, + "language_loss": 0.87430406, + "learning_rate": 0.0006218205992825566, + "loss": 0.88519287, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2282, + "time_per_iteration": 2.6424663066864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_mlp": 1.07271123, + "diversity_loss_mlp": 0.0, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.06483845116972914, + "language_loss": 0.81733787, + "learning_rate": 0.0006215184215359671, + "loss": 0.8281818, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2283, + "time_per_iteration": 2.736311674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087917, + "balance_loss_mlp": 1.07662153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.0656289826640407, + "language_loss": 0.86697561, + "learning_rate": 0.0006212161966117425, + "loss": 0.8778547, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2284, + "time_per_iteration": 2.727402448654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091514, + "balance_loss_mlp": 1.07989156, + "diversity_loss_mlp": 0.0, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.07463232969806483, + "language_loss": 0.81628394, + "learning_rate": 0.0006209139246272164, + "loss": 0.8271991, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2285, + "time_per_iteration": 2.978759527206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.08205843, + "diversity_loss_mlp": 0.0, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.08236326374350296, + "language_loss": 0.81938732, + "learning_rate": 0.0006206116056997421, + "loss": 0.83032608, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2286, + "time_per_iteration": 2.6111207008361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085811, + "balance_loss_mlp": 1.07444477, + "diversity_loss_mlp": 0.0, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.06662472973472185, + "language_loss": 0.82727671, + "learning_rate": 0.0006203092399466892, + "loss": 0.83813483, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2287, + "time_per_iteration": 2.6246864795684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109279, + "balance_loss_mlp": 1.08137023, + "diversity_loss_mlp": 0.0, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.06470350083987941, + "language_loss": 0.85380936, + "learning_rate": 0.0006200068274854473, + "loss": 0.86473733, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.11419678, + "routerloss_mlp": 0.0, + "step": 2288, + "time_per_iteration": 2.675197124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091416, + "balance_loss_mlp": 1.07988858, + "diversity_loss_mlp": 0.0, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0650031810595099, + "language_loss": 0.8588661, + "learning_rate": 0.0006197043684334229, + "loss": 0.86978024, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2289, + "time_per_iteration": 2.787095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_mlp": 1.08063841, + "diversity_loss_mlp": 0.0, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.0715970788084748, + "language_loss": 0.79333103, + "learning_rate": 0.0006194018629080411, + "loss": 0.80425215, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2290, + "time_per_iteration": 2.817836284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.09150028, + "diversity_loss_mlp": 0.0, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.07061114258803743, + "language_loss": 0.81714827, + "learning_rate": 0.0006190993110267451, + "loss": 0.82817852, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2291, + "time_per_iteration": 2.741288900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108227, + "balance_loss_mlp": 1.09614503, + "diversity_loss_mlp": 0.0, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.07455801894128893, + "language_loss": 0.84193838, + "learning_rate": 0.0006187967129069958, + "loss": 0.85302061, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2292, + "time_per_iteration": 2.5778286457061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106682, + "balance_loss_mlp": 1.09472573, + "diversity_loss_mlp": 0.0, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.06400814904414545, + "language_loss": 0.8690064, + "learning_rate": 0.0006184940686662722, + "loss": 0.88007319, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2293, + "time_per_iteration": 2.7292487621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111022, + "balance_loss_mlp": 1.09812045, + "diversity_loss_mlp": 0.0, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06813451942076464, + "language_loss": 0.90379488, + "learning_rate": 0.0006181913784220714, + "loss": 0.91489702, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2294, + "time_per_iteration": 2.6506428718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081962, + "balance_loss_mlp": 1.0750953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.029819366941177792, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81635749, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2295, + "time_per_iteration": 4.882002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110182, + "balance_loss_mlp": 1.09772444, + "diversity_loss_mlp": 0.0, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07012194180041048, + "language_loss": 0.7971437, + "learning_rate": 0.0006175858603933146, + "loss": 0.80824548, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.12469482, + "routerloss_mlp": 0.0, + "step": 2296, + "time_per_iteration": 2.8836371898651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908854, + "balance_loss_mlp": 1.58032632, + "diversity_loss_mlp": 0.2095283, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.03267646081870075, + "language_loss": 0.80986243, + "learning_rate": 0.0006172830328438416, + "loss": 0.81895095, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01392685, + "step": 2297, + "time_per_iteration": 2.9758472442626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093338, + "balance_loss_mlp": 1.0806725, + "diversity_loss_mlp": 0.0, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.0684627092891604, + "language_loss": 0.86739677, + "learning_rate": 0.0006169801597610572, + "loss": 0.87833017, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2298, + "time_per_iteration": 2.796999454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080861, + "balance_loss_mlp": 1.06855834, + "diversity_loss_mlp": 0.0, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09148837874044675, + "language_loss": 0.89672303, + "learning_rate": 0.0006166772412625469, + "loss": 0.90753162, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2299, + "time_per_iteration": 2.719217300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079493, + "balance_loss_mlp": 1.06674969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.0806717243265584, + "language_loss": 0.81995088, + "learning_rate": 0.0006163742774659141, + "loss": 0.83074582, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2300, + "time_per_iteration": 2.857851266860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082582, + "balance_loss_mlp": 1.07051837, + "diversity_loss_mlp": 0.0, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07368324051857801, + "language_loss": 0.85920924, + "learning_rate": 0.0006160712684887801, + "loss": 0.87003505, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2301, + "time_per_iteration": 2.7615816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076648, + "balance_loss_mlp": 1.06491232, + "diversity_loss_mlp": 0.0, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.07775198871362894, + "language_loss": 0.81987381, + "learning_rate": 0.0006157682144487832, + "loss": 0.83064032, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2302, + "time_per_iteration": 2.759446620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071769, + "balance_loss_mlp": 1.05998516, + "diversity_loss_mlp": 0.0, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.07391427816126875, + "language_loss": 0.82887244, + "learning_rate": 0.0006154651154635793, + "loss": 0.83959019, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2303, + "time_per_iteration": 2.8566582202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074582, + "balance_loss_mlp": 1.0627867, + "diversity_loss_mlp": 0.0, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07276664214775759, + "language_loss": 0.84800553, + "learning_rate": 0.0006151619716508421, + "loss": 0.85875136, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2304, + "time_per_iteration": 2.678624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070842, + "balance_loss_mlp": 1.05890322, + "diversity_loss_mlp": 0.0, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0708190445963316, + "language_loss": 0.87117589, + "learning_rate": 0.0006148587831282625, + "loss": 0.88188434, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2305, + "time_per_iteration": 2.6833643913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05813479, + "diversity_loss_mlp": 0.0, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03167846404368131, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80241072, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2306, + "time_per_iteration": 4.908214092254639 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.06202734, + "diversity_loss_mlp": 0.0, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.10781991147306623, + "language_loss": 0.87386847, + "learning_rate": 0.0006142522724244255, + "loss": 0.8846153, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.12664795, + "routerloss_mlp": 0.0, + "step": 2307, + "time_per_iteration": 2.559011459350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_mlp": 1.03301477, + "diversity_loss_mlp": 0.0, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.019467834986953515, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77524698, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.06982422, + "routerloss_mlp": 0.0, + "step": 2308, + "time_per_iteration": 4.990226984024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010956, + "balance_loss_mlp": 1.08379281, + "diversity_loss_mlp": 0.0, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.134173965781989, + "language_loss": 0.77330542, + "learning_rate": 0.000613645584293942, + "loss": 0.78426147, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2309, + "time_per_iteration": 2.925625801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096392, + "balance_loss_mlp": 1.08444726, + "diversity_loss_mlp": 0.0, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.07260585347328512, + "language_loss": 0.83497787, + "learning_rate": 0.0006133421739881185, + "loss": 0.84594172, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2310, + "time_per_iteration": 2.6521387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105727, + "balance_loss_mlp": 1.09360933, + "diversity_loss_mlp": 0.0, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08716252058009813, + "language_loss": 0.82747865, + "learning_rate": 0.0006130387196789605, + "loss": 0.8385359, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2311, + "time_per_iteration": 2.7266759872436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100575, + "balance_loss_mlp": 1.08809423, + "diversity_loss_mlp": 0.0, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.057672451626414926, + "language_loss": 0.84308195, + "learning_rate": 0.0006127352214842795, + "loss": 0.85408771, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2312, + "time_per_iteration": 2.9728119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_mlp": 1.09263897, + "diversity_loss_mlp": 0.0, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.09124128780751645, + "language_loss": 0.85551131, + "learning_rate": 0.0006124316795219041, + "loss": 0.86655927, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2313, + "time_per_iteration": 2.793999671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098642, + "balance_loss_mlp": 1.08649504, + "diversity_loss_mlp": 0.0, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.07392199689713573, + "language_loss": 0.82170153, + "learning_rate": 0.0006121280939096794, + "loss": 0.83268797, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 2314, + "time_per_iteration": 2.7882213592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087686, + "balance_loss_mlp": 1.07496047, + "diversity_loss_mlp": 0.0, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.07188819518398708, + "language_loss": 0.87831259, + "learning_rate": 0.000611824464765468, + "loss": 0.88918942, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.12738037, + "routerloss_mlp": 0.0, + "step": 2315, + "time_per_iteration": 2.570239305496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_mlp": 1.03435254, + "diversity_loss_mlp": 0.0, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.031544046963938845, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79636735, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 2316, + "time_per_iteration": 4.63933539390564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107211, + "balance_loss_mlp": 1.05995071, + "diversity_loss_mlp": 0.0, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.10006595419905694, + "language_loss": 0.85561663, + "learning_rate": 0.000611217076352619, + "loss": 0.86633772, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2317, + "time_per_iteration": 2.763282299041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068374, + "balance_loss_mlp": 1.05613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.07080250397958886, + "language_loss": 0.8323034, + "learning_rate": 0.0006109133173197905, + "loss": 0.84298718, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2318, + "time_per_iteration": 2.7228074073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.0546751, + "diversity_loss_mlp": 0.0, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07919775459104113, + "language_loss": 0.85392821, + "learning_rate": 0.0006106095152265935, + "loss": 0.86459887, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.12390137, + "routerloss_mlp": 0.0, + "step": 2319, + "time_per_iteration": 2.950333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.05547166, + "diversity_loss_mlp": 0.0, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.061336847968553085, + "language_loss": 0.84789562, + "learning_rate": 0.0006103056701909739, + "loss": 0.85857224, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2320, + "time_per_iteration": 2.9283788204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076472, + "balance_loss_mlp": 1.06437278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.06696737396207848, + "language_loss": 0.83276129, + "learning_rate": 0.0006100017823308956, + "loss": 0.84352595, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2321, + "time_per_iteration": 3.159337282180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072799, + "balance_loss_mlp": 1.06091988, + "diversity_loss_mlp": 0.0, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.07676377008356373, + "language_loss": 0.79803503, + "learning_rate": 0.0006096978517643377, + "loss": 0.80876303, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2322, + "time_per_iteration": 2.8253674507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00921995, + "balance_loss_mlp": 1.60181236, + "diversity_loss_mlp": 0.21422489, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.03237790796068106, + "language_loss": 0.83347481, + "learning_rate": 0.0006093938786092968, + "loss": 0.84269476, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01397606, + "step": 2323, + "time_per_iteration": 2.648444890975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110106, + "balance_loss_mlp": 1.09840608, + "diversity_loss_mlp": 0.0, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.07300553293113453, + "language_loss": 0.90023661, + "learning_rate": 0.0006090898629837857, + "loss": 0.91133773, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2324, + "time_per_iteration": 2.852698564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126468, + "balance_loss_mlp": 1.11461282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.06000654076761871, + "language_loss": 0.87143672, + "learning_rate": 0.0006087858050058337, + "loss": 0.8827014, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2325, + "time_per_iteration": 2.7674834728240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138358, + "balance_loss_mlp": 1.12663388, + "diversity_loss_mlp": 0.0, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.0853990663964482, + "language_loss": 0.82412744, + "learning_rate": 0.0006084817047934866, + "loss": 0.83551097, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2326, + "time_per_iteration": 2.6421871185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121575, + "balance_loss_mlp": 1.10977352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08985792381424736, + "language_loss": 0.89330196, + "learning_rate": 0.0006081775624648066, + "loss": 0.90451771, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2327, + "time_per_iteration": 2.578197956085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131674, + "balance_loss_mlp": 1.12057006, + "diversity_loss_mlp": 0.0, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.0872530433154025, + "language_loss": 0.83162999, + "learning_rate": 0.0006078733781378721, + "loss": 0.84294665, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2328, + "time_per_iteration": 2.6186208724975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099348, + "balance_loss_mlp": 1.08810675, + "diversity_loss_mlp": 0.0, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07633837573658239, + "language_loss": 0.82202363, + "learning_rate": 0.0006075691519307781, + "loss": 0.83301711, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2329, + "time_per_iteration": 2.9000244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094166, + "balance_loss_mlp": 1.08247721, + "diversity_loss_mlp": 0.0, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0736281868256213, + "language_loss": 0.81618124, + "learning_rate": 0.0006072648839616356, + "loss": 0.82712287, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2330, + "time_per_iteration": 2.6364829540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083826, + "balance_loss_mlp": 1.07230425, + "diversity_loss_mlp": 0.0, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.0657010816534965, + "language_loss": 0.82723016, + "learning_rate": 0.0006069605743485718, + "loss": 0.83806837, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2331, + "time_per_iteration": 3.3334474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086805, + "balance_loss_mlp": 1.07531917, + "diversity_loss_mlp": 0.0, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07225675858451452, + "language_loss": 0.83265316, + "learning_rate": 0.0006066562232097303, + "loss": 0.84352124, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2332, + "time_per_iteration": 2.705143690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.07051468, + "diversity_loss_mlp": 0.0, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.06521315479324259, + "language_loss": 0.8614397, + "learning_rate": 0.0006063518306632708, + "loss": 0.87226027, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2333, + "time_per_iteration": 2.9501705169677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085822, + "balance_loss_mlp": 1.07427073, + "diversity_loss_mlp": 0.0, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.07251688845149425, + "language_loss": 0.82197714, + "learning_rate": 0.0006060473968273688, + "loss": 0.83283544, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2334, + "time_per_iteration": 2.708394765853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_mlp": 1.032179, + "diversity_loss_mlp": 0.0, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.02865006957504222, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78918916, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.07177734, + "routerloss_mlp": 0.0, + "step": 2335, + "time_per_iteration": 4.866912841796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_mlp": 1.01901519, + "diversity_loss_mlp": 0.0, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.021847156852776353, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82031286, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 2336, + "time_per_iteration": 4.834076642990112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108818, + "balance_loss_mlp": 1.07613969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.09890748330953583, + "language_loss": 0.88285863, + "learning_rate": 0.0006051338487650047, + "loss": 0.89374042, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2337, + "time_per_iteration": 2.4428114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00930205, + "balance_loss_mlp": 1.62015963, + "diversity_loss_mlp": 0.20974493, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.03186253719782368, + "language_loss": 0.82399797, + "learning_rate": 0.0006048292509534095, + "loss": 0.83329999, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01525321, + "step": 2338, + "time_per_iteration": 2.6332457065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.06772542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.08456945041025239, + "language_loss": 0.77873439, + "learning_rate": 0.0006045246124434895, + "loss": 0.7895329, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2339, + "time_per_iteration": 2.7590980529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073061, + "balance_loss_mlp": 1.06156278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06841757056071682, + "language_loss": 0.86623305, + "learning_rate": 0.0006042199333535162, + "loss": 0.87696362, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2340, + "time_per_iteration": 3.293574333190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079106, + "balance_loss_mlp": 1.06769133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06101547553515947, + "language_loss": 0.84343052, + "learning_rate": 0.0006039152138017763, + "loss": 0.85422158, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2341, + "time_per_iteration": 3.0700981616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.07579744, + "diversity_loss_mlp": 0.0, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.09071323966594208, + "language_loss": 0.83541143, + "learning_rate": 0.0006036104539065726, + "loss": 0.84628195, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.11260986, + "routerloss_mlp": 0.0, + "step": 2342, + "time_per_iteration": 2.6694719791412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089407, + "balance_loss_mlp": 1.07793319, + "diversity_loss_mlp": 0.0, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.08270437502254605, + "language_loss": 0.84371507, + "learning_rate": 0.000603305653786223, + "loss": 0.85460913, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2343, + "time_per_iteration": 3.16105318069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_mlp": 1.07187295, + "diversity_loss_mlp": 0.0, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.07028076371432387, + "language_loss": 0.84103405, + "learning_rate": 0.0006030008135590622, + "loss": 0.85186827, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2344, + "time_per_iteration": 2.7197835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082164, + "balance_loss_mlp": 1.07096398, + "diversity_loss_mlp": 0.0, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.05864949769745669, + "language_loss": 0.7999413, + "learning_rate": 0.0006026959333434387, + "loss": 0.81076288, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2345, + "time_per_iteration": 2.777010202407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919083, + "balance_loss_mlp": 1.6008426, + "diversity_loss_mlp": 0.20793086, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.028469676504860836, + "language_loss": 0.77684712, + "learning_rate": 0.0006023910132577181, + "loss": 0.78603798, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01469593, + "step": 2346, + "time_per_iteration": 2.689173936843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.08186746, + "diversity_loss_mlp": 0.0, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.07173117007756048, + "language_loss": 0.84956741, + "learning_rate": 0.0006020860534202806, + "loss": 0.86050057, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2347, + "time_per_iteration": 2.499941110610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099048, + "balance_loss_mlp": 1.08747303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06525031943024168, + "language_loss": 0.81076705, + "learning_rate": 0.0006017810539495224, + "loss": 0.82175756, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2348, + "time_per_iteration": 2.9487318992614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094541, + "balance_loss_mlp": 1.08284068, + "diversity_loss_mlp": 0.0, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07881291561071736, + "language_loss": 0.82607108, + "learning_rate": 0.0006014760149638547, + "loss": 0.83701646, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.11700439, + "routerloss_mlp": 0.0, + "step": 2349, + "time_per_iteration": 2.7228691577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.0852139, + "diversity_loss_mlp": 0.0, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.08019466042103662, + "language_loss": 0.88398969, + "learning_rate": 0.000601170936581704, + "loss": 0.8949548, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2350, + "time_per_iteration": 2.521714687347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07951522, + "diversity_loss_mlp": 0.0, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.08533615412567333, + "language_loss": 0.84897137, + "learning_rate": 0.0006008658189215121, + "loss": 0.85987866, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2351, + "time_per_iteration": 2.6506216526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07545722, + "diversity_loss_mlp": 0.0, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.09237808795246917, + "language_loss": 0.80232167, + "learning_rate": 0.0006005606621017366, + "loss": 0.81319243, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2352, + "time_per_iteration": 2.5878968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010807, + "balance_loss_mlp": 1.06907678, + "diversity_loss_mlp": 0.0, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.07057821380790058, + "language_loss": 0.80339801, + "learning_rate": 0.0006002554662408496, + "loss": 0.81420493, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2353, + "time_per_iteration": 2.883782386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.0691061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.0736680584084088, + "language_loss": 0.9135446, + "learning_rate": 0.0005999502314573388, + "loss": 0.9243511, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2354, + "time_per_iteration": 2.645484685897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_mlp": 1.09201527, + "diversity_loss_mlp": 0.0, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07036557956994945, + "language_loss": 0.86196381, + "learning_rate": 0.0005996449578697066, + "loss": 0.87299991, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2355, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906536, + "balance_loss_mlp": 1.57839537, + "diversity_loss_mlp": 0.20635399, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.031145483684461562, + "language_loss": 0.81619978, + "learning_rate": 0.0005993396455964709, + "loss": 0.82526517, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01416124, + "step": 2356, + "time_per_iteration": 2.7277767658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.14805746, + "diversity_loss_mlp": 0.0, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07904312092760724, + "language_loss": 0.81657517, + "learning_rate": 0.0005990342947561647, + "loss": 0.82816887, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2357, + "time_per_iteration": 2.696223258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167894, + "balance_loss_mlp": 1.15651524, + "diversity_loss_mlp": 0.0, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.07381995676601517, + "language_loss": 0.78198934, + "learning_rate": 0.0005987289054673351, + "loss": 0.79366827, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2358, + "time_per_iteration": 2.602642059326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360078, + "balance_loss_mlp": 1.35392714, + "diversity_loss_mlp": 0.0, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.12195170998658643, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77935815, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2359, + "time_per_iteration": 4.880090713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146892, + "balance_loss_mlp": 1.13553107, + "diversity_loss_mlp": 0.0, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07250720881476776, + "language_loss": 0.91548061, + "learning_rate": 0.0005981180120183722, + "loss": 0.9269495, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2360, + "time_per_iteration": 2.680730104446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133243, + "balance_loss_mlp": 1.121382, + "diversity_loss_mlp": 0.0, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.055968167495159496, + "language_loss": 0.85338825, + "learning_rate": 0.0005978125080954089, + "loss": 0.8647207, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2361, + "time_per_iteration": 2.791376829147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124184, + "balance_loss_mlp": 1.11265099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.08653591933533131, + "language_loss": 0.77322888, + "learning_rate": 0.000597506966198262, + "loss": 0.7844708, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2362, + "time_per_iteration": 2.97446870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119088, + "balance_loss_mlp": 1.10733426, + "diversity_loss_mlp": 0.0, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.09240364374598002, + "language_loss": 0.84247041, + "learning_rate": 0.0005972013864455536, + "loss": 0.85366124, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2363, + "time_per_iteration": 2.577167510986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108786, + "balance_loss_mlp": 1.09771168, + "diversity_loss_mlp": 0.0, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.0787330127694287, + "language_loss": 0.8535012, + "learning_rate": 0.0005968957689559203, + "loss": 0.8645891, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2364, + "time_per_iteration": 2.7120981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105615, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07389843074969835, + "language_loss": 0.88484383, + "learning_rate": 0.0005965901138480131, + "loss": 0.89590001, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2365, + "time_per_iteration": 2.578874349594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110202, + "balance_loss_mlp": 1.09081471, + "diversity_loss_mlp": 0.0, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.06426783448513047, + "language_loss": 0.87068385, + "learning_rate": 0.0005962844212404982, + "loss": 0.88170409, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.11206055, + "routerloss_mlp": 0.0, + "step": 2366, + "time_per_iteration": 2.6638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096105, + "balance_loss_mlp": 1.08472049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05830156527831164, + "language_loss": 0.87147355, + "learning_rate": 0.0005959786912520558, + "loss": 0.88243461, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2367, + "time_per_iteration": 2.6142454147338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088371, + "balance_loss_mlp": 1.07726681, + "diversity_loss_mlp": 0.0, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.06261196085687584, + "language_loss": 0.83712542, + "learning_rate": 0.0005956729240013806, + "loss": 0.84800917, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2368, + "time_per_iteration": 2.786256790161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095858, + "balance_loss_mlp": 1.08447385, + "diversity_loss_mlp": 0.0, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06874460659515655, + "language_loss": 0.91648531, + "learning_rate": 0.0005953671196071824, + "loss": 0.92744386, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2369, + "time_per_iteration": 2.756943941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093695, + "balance_loss_mlp": 1.08220375, + "diversity_loss_mlp": 0.0, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.07258619671695062, + "language_loss": 0.80044961, + "learning_rate": 0.0005950612781881846, + "loss": 0.81138659, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2370, + "time_per_iteration": 2.6791019439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906758, + "balance_loss_mlp": 1.57760763, + "diversity_loss_mlp": 0.20680004, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.03266097765038979, + "language_loss": 0.76005763, + "learning_rate": 0.0005947553998631259, + "loss": 0.76912522, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01455403, + "step": 2371, + "time_per_iteration": 2.908493995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010769, + "balance_loss_mlp": 1.06543183, + "diversity_loss_mlp": 0.0, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.05564189265933484, + "language_loss": 0.79205543, + "learning_rate": 0.000594449484750758, + "loss": 0.80282438, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2372, + "time_per_iteration": 3.18151593208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072251, + "balance_loss_mlp": 1.06046152, + "diversity_loss_mlp": 0.0, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07444834598910231, + "language_loss": 0.83208215, + "learning_rate": 0.0005941435329698484, + "loss": 0.84280467, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2373, + "time_per_iteration": 2.6709630489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.05895281, + "diversity_loss_mlp": 0.0, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.06837725942446468, + "language_loss": 0.83204812, + "learning_rate": 0.0005938375446391778, + "loss": 0.84275293, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2374, + "time_per_iteration": 2.6943106651306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.06261396, + "diversity_loss_mlp": 0.0, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.0748623734907781, + "language_loss": 0.8912878, + "learning_rate": 0.0005935315198775415, + "loss": 0.90203297, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2375, + "time_per_iteration": 2.6303911209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.05491209, + "diversity_loss_mlp": 0.0, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06590971106227904, + "language_loss": 0.87093645, + "learning_rate": 0.0005932254588037486, + "loss": 0.88160467, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2376, + "time_per_iteration": 2.5003554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106434, + "balance_loss_mlp": 1.0520016, + "diversity_loss_mlp": 0.0, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07188519107297629, + "language_loss": 0.86239958, + "learning_rate": 0.000592919361536623, + "loss": 0.87304294, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2377, + "time_per_iteration": 2.6426758766174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106134, + "balance_loss_mlp": 1.04946113, + "diversity_loss_mlp": 0.0, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.06083573176815847, + "language_loss": 0.88679874, + "learning_rate": 0.0005926132281950017, + "loss": 0.89741206, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2378, + "time_per_iteration": 2.7510690689086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065869, + "balance_loss_mlp": 1.05310154, + "diversity_loss_mlp": 0.0, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07940360452878177, + "language_loss": 0.85365742, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431611, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.12774658, + "routerloss_mlp": 0.0, + "step": 2379, + "time_per_iteration": 2.7969985008239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066405, + "balance_loss_mlp": 1.05444837, + "diversity_loss_mlp": 0.0, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.06398281947580985, + "language_loss": 0.86384034, + "learning_rate": 0.0005920008537636931, + "loss": 0.87450439, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2380, + "time_per_iteration": 2.90964412689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.05391335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.05698304417859526, + "language_loss": 0.86739266, + "learning_rate": 0.0005916946129117504, + "loss": 0.87805718, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.12548828, + "routerloss_mlp": 0.0, + "step": 2381, + "time_per_iteration": 2.9013612270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.06223381, + "diversity_loss_mlp": 0.0, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07634094682432664, + "language_loss": 0.80304879, + "learning_rate": 0.0005913883364608017, + "loss": 0.81379426, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2382, + "time_per_iteration": 3.086503505706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_mlp": 1.07212973, + "diversity_loss_mlp": 0.0, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.06243795661807547, + "language_loss": 0.8841778, + "learning_rate": 0.0005910820245297542, + "loss": 0.89501894, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 2383, + "time_per_iteration": 2.8612842559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090258, + "balance_loss_mlp": 1.07756186, + "diversity_loss_mlp": 0.0, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.08243832238560393, + "language_loss": 0.80972016, + "learning_rate": 0.000590775677237529, + "loss": 0.82062268, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 2384, + "time_per_iteration": 2.731405735015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094631, + "balance_loss_mlp": 1.08257282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.07578687885193977, + "language_loss": 0.80532229, + "learning_rate": 0.0005904692947030601, + "loss": 0.81626856, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.1204834, + "routerloss_mlp": 0.0, + "step": 2385, + "time_per_iteration": 2.6176209449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106556, + "balance_loss_mlp": 1.09437895, + "diversity_loss_mlp": 0.0, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.08078833732724985, + "language_loss": 0.8953619, + "learning_rate": 0.0005901628770452963, + "loss": 0.90642744, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2386, + "time_per_iteration": 2.5513737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.10345697, + "diversity_loss_mlp": 0.0, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.09403156888929357, + "language_loss": 0.87502134, + "learning_rate": 0.000589856424383199, + "loss": 0.88617843, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2387, + "time_per_iteration": 2.599862813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111298, + "balance_loss_mlp": 1.10114813, + "diversity_loss_mlp": 0.0, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.08117329221401763, + "language_loss": 0.8309918, + "learning_rate": 0.000589549936835744, + "loss": 0.8421216, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2388, + "time_per_iteration": 2.914754867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101869, + "balance_loss_mlp": 1.0899775, + "diversity_loss_mlp": 0.0, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06559429512714879, + "language_loss": 0.79056096, + "learning_rate": 0.0005892434145219202, + "loss": 0.80157959, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2389, + "time_per_iteration": 2.6295268535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898813, + "balance_loss_mlp": 1.5620172, + "diversity_loss_mlp": 0.2081904, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.0365067866217014, + "language_loss": 0.82780147, + "learning_rate": 0.0005889368575607303, + "loss": 0.83678961, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01370906, + "step": 2390, + "time_per_iteration": 2.8635401725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089349, + "balance_loss_mlp": 1.07753515, + "diversity_loss_mlp": 0.0, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.056196182118315396, + "language_loss": 0.78421402, + "learning_rate": 0.00058863026607119, + "loss": 0.79510748, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2391, + "time_per_iteration": 3.0734708309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.08715332, + "diversity_loss_mlp": 0.0, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07079174515079527, + "language_loss": 0.795928, + "learning_rate": 0.0005883236401723287, + "loss": 0.80691886, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2392, + "time_per_iteration": 3.1697676181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.08348131, + "diversity_loss_mlp": 0.0, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08882239564338372, + "language_loss": 0.84418833, + "learning_rate": 0.0005880169799831893, + "loss": 0.85514069, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2393, + "time_per_iteration": 2.668509006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_mlp": 1.08327174, + "diversity_loss_mlp": 0.0, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.06874062850812142, + "language_loss": 0.81593782, + "learning_rate": 0.0005877102856228278, + "loss": 0.82688844, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2394, + "time_per_iteration": 2.862039566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099301, + "balance_loss_mlp": 1.08791018, + "diversity_loss_mlp": 0.0, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07005170830273995, + "language_loss": 0.84822053, + "learning_rate": 0.0005874035572103133, + "loss": 0.85921353, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2395, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092906, + "balance_loss_mlp": 1.08152771, + "diversity_loss_mlp": 0.0, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09691208121118819, + "language_loss": 0.82382149, + "learning_rate": 0.0005870967948647288, + "loss": 0.83475053, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2396, + "time_per_iteration": 2.8379006385803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259876, + "balance_loss_mlp": 1.25238955, + "diversity_loss_mlp": 0.0, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.08205623370138872, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75568175, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2397, + "time_per_iteration": 5.0380027294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912357, + "balance_loss_mlp": 1.5885272, + "diversity_loss_mlp": 0.20776251, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.030510515868204604, + "language_loss": 0.86040902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86953259, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0142122, + "step": 2398, + "time_per_iteration": 2.9795196056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099565, + "balance_loss_mlp": 1.08854449, + "diversity_loss_mlp": 0.0, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.07495608045078013, + "language_loss": 0.75224954, + "learning_rate": 0.0005861763054205754, + "loss": 0.76324517, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2399, + "time_per_iteration": 2.7307660579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908198, + "balance_loss_mlp": 1.58042729, + "diversity_loss_mlp": 0.20863593, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.03052990379504839, + "language_loss": 0.8056978, + "learning_rate": 0.0005858694085337976, + "loss": 0.81477976, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01366598, + "step": 2400, + "time_per_iteration": 2.8421711921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115275, + "balance_loss_mlp": 1.10424817, + "diversity_loss_mlp": 0.0, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08470381171074581, + "language_loss": 0.8355788, + "learning_rate": 0.0005855624783095589, + "loss": 0.84673154, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2401, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10386109, + "diversity_loss_mlp": 0.0, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.07139821582333657, + "language_loss": 0.85265267, + "learning_rate": 0.00058525551486702, + "loss": 0.86379993, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2402, + "time_per_iteration": 2.5159239768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119193, + "balance_loss_mlp": 1.10795164, + "diversity_loss_mlp": 0.0, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.08747389081307531, + "language_loss": 0.80850065, + "learning_rate": 0.0005849485183253548, + "loss": 0.81969261, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2403, + "time_per_iteration": 2.643031358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110943, + "balance_loss_mlp": 1.09971905, + "diversity_loss_mlp": 0.0, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06974006499463392, + "language_loss": 0.8764264, + "learning_rate": 0.0005846414888037501, + "loss": 0.88753581, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2404, + "time_per_iteration": 2.4847412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_mlp": 1.07962489, + "diversity_loss_mlp": 0.0, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.07303422211334305, + "language_loss": 0.82384312, + "learning_rate": 0.0005843344264214049, + "loss": 0.83475375, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.11444092, + "routerloss_mlp": 0.0, + "step": 2405, + "time_per_iteration": 2.7470028400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093931, + "balance_loss_mlp": 1.08265948, + "diversity_loss_mlp": 0.0, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.06660378994806349, + "language_loss": 0.84838545, + "learning_rate": 0.0005840273312975317, + "loss": 0.85932475, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2406, + "time_per_iteration": 2.834179162979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082019, + "balance_loss_mlp": 1.07018733, + "diversity_loss_mlp": 0.0, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.07201348711751891, + "language_loss": 0.89853442, + "learning_rate": 0.0005837202035513555, + "loss": 0.90935457, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2407, + "time_per_iteration": 2.578505277633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081302, + "balance_loss_mlp": 1.06933987, + "diversity_loss_mlp": 0.0, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06479654524201506, + "language_loss": 0.81299376, + "learning_rate": 0.0005834130433021136, + "loss": 0.82380676, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2408, + "time_per_iteration": 2.742830991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075359, + "balance_loss_mlp": 1.0631156, + "diversity_loss_mlp": 0.0, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.06628126289532602, + "language_loss": 0.73402894, + "learning_rate": 0.0005831058506690563, + "loss": 0.74478251, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2409, + "time_per_iteration": 2.6239566802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875374, + "balance_loss_mlp": 1.5126431, + "diversity_loss_mlp": 0.20975235, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.03030502692098504, + "language_loss": 0.86162984, + "learning_rate": 0.0005827986257714464, + "loss": 0.87038362, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01417591, + "step": 2410, + "time_per_iteration": 2.9302031993865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.05664992, + "diversity_loss_mlp": 0.0, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.07558638886093381, + "language_loss": 0.88803709, + "learning_rate": 0.0005824913687285591, + "loss": 0.89872897, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2411, + "time_per_iteration": 2.685814142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070655, + "balance_loss_mlp": 1.05821514, + "diversity_loss_mlp": 0.0, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.1080687232114875, + "language_loss": 0.81367224, + "learning_rate": 0.0005821840796596821, + "loss": 0.82437879, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2412, + "time_per_iteration": 2.6551058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.06099916, + "diversity_loss_mlp": 0.0, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.07026214254932567, + "language_loss": 0.80428362, + "learning_rate": 0.0005818767586841158, + "loss": 0.81501973, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 2413, + "time_per_iteration": 2.759437322616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.07259476, + "diversity_loss_mlp": 0.0, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.08627931539992734, + "language_loss": 0.86441922, + "learning_rate": 0.0005815694059211726, + "loss": 0.8752715, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 2414, + "time_per_iteration": 2.658977746963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171514, + "balance_loss_mlp": 1.16250181, + "diversity_loss_mlp": 0.0, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.047494824411654174, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82045138, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 2415, + "time_per_iteration": 4.799519777297974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145606, + "balance_loss_mlp": 1.13711834, + "diversity_loss_mlp": 0.0, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.043373387729815825, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78090668, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 2416, + "time_per_iteration": 4.990553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0087124, + "balance_loss_mlp": 1.50839305, + "diversity_loss_mlp": 0.20828754, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.030578892859867562, + "language_loss": 0.86378521, + "learning_rate": 0.0005806471581013931, + "loss": 0.87249762, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289999, + "step": 2417, + "time_per_iteration": 2.6900436878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122345, + "balance_loss_mlp": 1.11040044, + "diversity_loss_mlp": 0.0, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.07418438196536063, + "language_loss": 0.78360349, + "learning_rate": 0.0005803396793823146, + "loss": 0.79482698, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2418, + "time_per_iteration": 2.8027873039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113389, + "balance_loss_mlp": 1.12212396, + "diversity_loss_mlp": 0.0, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07660062238284089, + "language_loss": 0.85582161, + "learning_rate": 0.0005800321694726065, + "loss": 0.86716056, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2419, + "time_per_iteration": 4.293209075927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870744, + "balance_loss_mlp": 1.50698626, + "diversity_loss_mlp": 0.20827082, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.03270390918014964, + "language_loss": 0.86636543, + "learning_rate": 0.0005797246284916545, + "loss": 0.87507284, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01311516, + "step": 2420, + "time_per_iteration": 2.7184417247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112587, + "balance_loss_mlp": 1.1061976, + "diversity_loss_mlp": 0.0, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.04763479459010098, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78617769, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2421, + "time_per_iteration": 4.978823900222778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.1527952, + "diversity_loss_mlp": 0.0, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.08359324638355049, + "language_loss": 0.87635398, + "learning_rate": 0.0005791094537936233, + "loss": 0.8879956, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2422, + "time_per_iteration": 2.706270217895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145768, + "balance_loss_mlp": 1.1349256, + "diversity_loss_mlp": 0.0, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.07317342210777962, + "language_loss": 0.81790811, + "learning_rate": 0.0005788018203153762, + "loss": 0.82936579, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2423, + "time_per_iteration": 2.5965187549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114513, + "balance_loss_mlp": 1.13404965, + "diversity_loss_mlp": 0.0, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08308161607945047, + "language_loss": 0.85607517, + "learning_rate": 0.000578494156243549, + "loss": 0.86752647, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2424, + "time_per_iteration": 2.5783984661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124685, + "balance_loss_mlp": 1.1135745, + "diversity_loss_mlp": 0.0, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.06702614551613306, + "language_loss": 0.88852286, + "learning_rate": 0.0005781864616975878, + "loss": 0.89976966, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2425, + "time_per_iteration": 2.6615347862243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.09463954, + "diversity_loss_mlp": 0.0, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0790317604017366, + "language_loss": 0.84397781, + "learning_rate": 0.0005778787367969502, + "loss": 0.85503376, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2426, + "time_per_iteration": 2.5796711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095822, + "balance_loss_mlp": 1.08478928, + "diversity_loss_mlp": 0.0, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.062032004097500974, + "language_loss": 0.80925953, + "learning_rate": 0.0005775709816611053, + "loss": 0.82021779, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.11029053, + "routerloss_mlp": 0.0, + "step": 2427, + "time_per_iteration": 2.9491348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07454419, + "diversity_loss_mlp": 0.0, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0676389696771178, + "language_loss": 0.83549029, + "learning_rate": 0.0005772631964095346, + "loss": 0.8463425, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 2428, + "time_per_iteration": 2.6981353759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081501, + "balance_loss_mlp": 1.07072484, + "diversity_loss_mlp": 0.0, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.08126061261115217, + "language_loss": 0.8576231, + "learning_rate": 0.000576955381161731, + "loss": 0.86843812, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2429, + "time_per_iteration": 2.6633517742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.06313229, + "diversity_loss_mlp": 0.0, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.08275287351868318, + "language_loss": 0.86212349, + "learning_rate": 0.0005766475360371985, + "loss": 0.87286699, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2430, + "time_per_iteration": 2.5904853343963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.06205034, + "diversity_loss_mlp": 0.0, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0860704645170746, + "language_loss": 0.84563982, + "learning_rate": 0.0005763396611554536, + "loss": 0.85636878, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2431, + "time_per_iteration": 2.6467607021331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.0607698, + "diversity_loss_mlp": 0.0, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.08998246562287979, + "language_loss": 0.80544329, + "learning_rate": 0.0005760317566360237, + "loss": 0.81615859, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2432, + "time_per_iteration": 3.006641387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075816, + "balance_loss_mlp": 1.0648669, + "diversity_loss_mlp": 0.0, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.07509845156715887, + "language_loss": 0.84929144, + "learning_rate": 0.000575723822598448, + "loss": 0.86004961, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2433, + "time_per_iteration": 2.764425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067328, + "balance_loss_mlp": 1.0558188, + "diversity_loss_mlp": 0.0, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.06651895210271294, + "language_loss": 0.8167448, + "learning_rate": 0.0005754158591622773, + "loss": 0.82741809, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2434, + "time_per_iteration": 2.9786107540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075165, + "balance_loss_mlp": 1.06366098, + "diversity_loss_mlp": 0.0, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07251033111677281, + "language_loss": 0.82255369, + "learning_rate": 0.0005751078664470732, + "loss": 0.83330536, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2435, + "time_per_iteration": 2.5367684364318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079887, + "balance_loss_mlp": 1.06816268, + "diversity_loss_mlp": 0.0, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.07721942828462902, + "language_loss": 0.85977614, + "learning_rate": 0.0005747998445724094, + "loss": 0.87057501, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2436, + "time_per_iteration": 2.636200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_mlp": 1.07313251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.07122055500535385, + "language_loss": 0.89087129, + "learning_rate": 0.0005744917936578707, + "loss": 0.90172094, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2437, + "time_per_iteration": 2.7820210456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07790279, + "diversity_loss_mlp": 0.0, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.0674848593159629, + "language_loss": 0.84104413, + "learning_rate": 0.0005741837138230526, + "loss": 0.85194385, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2438, + "time_per_iteration": 2.7324602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091997, + "balance_loss_mlp": 1.07981968, + "diversity_loss_mlp": 0.0, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.08534673561441382, + "language_loss": 0.86345065, + "learning_rate": 0.0005738756051875627, + "loss": 0.87437063, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2439, + "time_per_iteration": 3.0705649852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098053, + "balance_loss_mlp": 1.08564377, + "diversity_loss_mlp": 0.0, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.06467123496854205, + "language_loss": 0.83114249, + "learning_rate": 0.0005735674678710192, + "loss": 0.84212297, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2440, + "time_per_iteration": 2.6645498275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.07644403, + "diversity_loss_mlp": 0.0, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.09155388913703945, + "language_loss": 0.81178355, + "learning_rate": 0.0005732593019930517, + "loss": 0.82267421, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2441, + "time_per_iteration": 2.892775774002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084176, + "balance_loss_mlp": 1.07203436, + "diversity_loss_mlp": 0.0, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.07090754106091501, + "language_loss": 0.87927258, + "learning_rate": 0.0005729511076733008, + "loss": 0.89011431, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2442, + "time_per_iteration": 2.629671096801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080039, + "balance_loss_mlp": 1.06766534, + "diversity_loss_mlp": 0.0, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.0886658808398658, + "language_loss": 0.85080904, + "learning_rate": 0.000572642885031418, + "loss": 0.86160946, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.1237793, + "routerloss_mlp": 0.0, + "step": 2443, + "time_per_iteration": 2.858177900314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083351, + "balance_loss_mlp": 1.07077432, + "diversity_loss_mlp": 0.0, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.06516149518751314, + "language_loss": 0.80735445, + "learning_rate": 0.0005723346341870662, + "loss": 0.81818795, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.12573242, + "routerloss_mlp": 0.0, + "step": 2444, + "time_per_iteration": 2.7146968841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_mlp": 1.07161689, + "diversity_loss_mlp": 0.0, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.08093347646647668, + "language_loss": 0.86360067, + "learning_rate": 0.0005720263552599188, + "loss": 0.87444162, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2445, + "time_per_iteration": 2.5240447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077035, + "balance_loss_mlp": 1.06469131, + "diversity_loss_mlp": 0.0, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.10031003663616385, + "language_loss": 0.80052316, + "learning_rate": 0.0005717180483696604, + "loss": 0.81129348, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.12347412, + "routerloss_mlp": 0.0, + "step": 2446, + "time_per_iteration": 2.8576042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.06456566, + "diversity_loss_mlp": 0.0, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.06704052343949889, + "language_loss": 0.82989585, + "learning_rate": 0.0005714097136359862, + "loss": 0.84066319, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2447, + "time_per_iteration": 2.624566078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841696, + "balance_loss_mlp": 1.45028305, + "diversity_loss_mlp": 0.205522, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.027205551471082397, + "language_loss": 0.86918223, + "learning_rate": 0.0005711013511786027, + "loss": 0.87759912, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01379322, + "step": 2448, + "time_per_iteration": 2.797086238861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106901, + "balance_loss_mlp": 1.05689788, + "diversity_loss_mlp": 0.0, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06342125158561994, + "language_loss": 0.83811176, + "learning_rate": 0.0005707929611172263, + "loss": 0.84880185, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2449, + "time_per_iteration": 2.731825351715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.05951726, + "diversity_loss_mlp": 0.0, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.09170207604049842, + "language_loss": 0.84256124, + "learning_rate": 0.000570484543571585, + "loss": 0.85327655, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2450, + "time_per_iteration": 2.5735461711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_mlp": 1.05268502, + "diversity_loss_mlp": 0.0, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.08479509676509417, + "language_loss": 0.82936448, + "learning_rate": 0.0005701760986614171, + "loss": 0.84001064, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2451, + "time_per_iteration": 2.537297248840332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071081, + "balance_loss_mlp": 1.0591718, + "diversity_loss_mlp": 0.0, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.059658494784791405, + "language_loss": 0.8734417, + "learning_rate": 0.0005698676265064714, + "loss": 0.88415247, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2452, + "time_per_iteration": 2.5586979389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.06525099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.0707454592736124, + "language_loss": 0.89208829, + "learning_rate": 0.0005695591272265074, + "loss": 0.90285689, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2453, + "time_per_iteration": 2.527719736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088311, + "balance_loss_mlp": 1.07617581, + "diversity_loss_mlp": 0.0, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.07134640406799209, + "language_loss": 0.81947398, + "learning_rate": 0.0005692506009412954, + "loss": 0.83035707, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2454, + "time_per_iteration": 2.6558947563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0064123, + "balance_loss_mlp": 1.11988485, + "diversity_loss_mlp": 0.13842735, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.002527541257966033, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78192496, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01207405, + "step": 2455, + "time_per_iteration": 5.005730628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.07716715, + "diversity_loss_mlp": 0.0, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07179176619920838, + "language_loss": 0.89308333, + "learning_rate": 0.0005686334678342593, + "loss": 0.90397304, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2456, + "time_per_iteration": 2.8779940605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094143, + "balance_loss_mlp": 1.08280611, + "diversity_loss_mlp": 0.0, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08187467616753978, + "language_loss": 0.81664062, + "learning_rate": 0.0005683248612520274, + "loss": 0.82758206, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2457, + "time_per_iteration": 3.0844156742095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087436, + "balance_loss_mlp": 1.07605195, + "diversity_loss_mlp": 0.0, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.08330432962991885, + "language_loss": 0.83940041, + "learning_rate": 0.0005680162281437321, + "loss": 0.85027468, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2458, + "time_per_iteration": 2.886364221572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_mlp": 1.07263231, + "diversity_loss_mlp": 0.0, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.06607837126207569, + "language_loss": 0.84340584, + "learning_rate": 0.000567707568629195, + "loss": 0.8542465, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2459, + "time_per_iteration": 2.7153613567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.0712074, + "diversity_loss_mlp": 0.0, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.0662532862091719, + "language_loss": 0.82247961, + "learning_rate": 0.0005673988828282486, + "loss": 0.8333075, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2460, + "time_per_iteration": 2.6740705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06760526, + "diversity_loss_mlp": 0.0, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05997115702153478, + "language_loss": 0.81122911, + "learning_rate": 0.0005670901708607352, + "loss": 0.82202172, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2461, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077887, + "balance_loss_mlp": 1.0661211, + "diversity_loss_mlp": 0.0, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.12722631062247966, + "language_loss": 0.83784962, + "learning_rate": 0.0005667814328465076, + "loss": 0.84862852, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2462, + "time_per_iteration": 2.62223744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_mlp": 1.06031179, + "diversity_loss_mlp": 0.0, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.10920156375550993, + "language_loss": 0.82163846, + "learning_rate": 0.0005664726689054285, + "loss": 0.83235747, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2463, + "time_per_iteration": 2.474776029586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.06096554, + "diversity_loss_mlp": 0.0, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.07990467081118383, + "language_loss": 0.80772603, + "learning_rate": 0.0005661638791573704, + "loss": 0.81845051, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2464, + "time_per_iteration": 2.699165105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073096, + "balance_loss_mlp": 1.06145513, + "diversity_loss_mlp": 0.0, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.06593248790897067, + "language_loss": 0.86978662, + "learning_rate": 0.0005658550637222164, + "loss": 0.8805176, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2465, + "time_per_iteration": 2.6154093742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.0586381, + "diversity_loss_mlp": 0.0, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.06422453310815268, + "language_loss": 0.82103038, + "learning_rate": 0.0005655462227198592, + "loss": 0.83173257, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2466, + "time_per_iteration": 2.888040065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068955, + "balance_loss_mlp": 1.05703366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.07464863741428074, + "language_loss": 0.84426093, + "learning_rate": 0.0005652373562702016, + "loss": 0.85495043, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2467, + "time_per_iteration": 2.6240220069885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071196, + "balance_loss_mlp": 1.05926943, + "diversity_loss_mlp": 0.0, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06778780294468974, + "language_loss": 0.88405621, + "learning_rate": 0.000564928464493156, + "loss": 0.89476824, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2468, + "time_per_iteration": 2.598493814468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068768, + "balance_loss_mlp": 1.05676329, + "diversity_loss_mlp": 0.0, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.06443301027733518, + "language_loss": 0.81735635, + "learning_rate": 0.000564619547508645, + "loss": 0.82804406, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2469, + "time_per_iteration": 4.510512828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_mlp": 1.05816698, + "diversity_loss_mlp": 0.0, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.0879456232971056, + "language_loss": 0.82882106, + "learning_rate": 0.0005643106054366008, + "loss": 0.83952397, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2470, + "time_per_iteration": 2.5648152828216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.06276536, + "diversity_loss_mlp": 0.0, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.06194770014341408, + "language_loss": 0.79193991, + "learning_rate": 0.000564001638396965, + "loss": 0.8026849, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2471, + "time_per_iteration": 2.7267987728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073205, + "balance_loss_mlp": 1.06152296, + "diversity_loss_mlp": 0.0, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.06505306942508977, + "language_loss": 0.82164901, + "learning_rate": 0.0005636926465096897, + "loss": 0.83238107, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2472, + "time_per_iteration": 3.035590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078551, + "balance_loss_mlp": 1.06670165, + "diversity_loss_mlp": 0.0, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08684318660371242, + "language_loss": 0.8723672, + "learning_rate": 0.0005633836298947363, + "loss": 0.88315272, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2473, + "time_per_iteration": 4.002026796340942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091096, + "balance_loss_mlp": 1.07912695, + "diversity_loss_mlp": 0.0, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.0706680414575132, + "language_loss": 0.70566314, + "learning_rate": 0.000563074588672075, + "loss": 0.71657413, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2474, + "time_per_iteration": 2.6985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089769, + "balance_loss_mlp": 1.07802129, + "diversity_loss_mlp": 0.0, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06282750442858279, + "language_loss": 0.85378051, + "learning_rate": 0.0005627655229616868, + "loss": 0.86467826, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2475, + "time_per_iteration": 2.7580935955047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091111, + "balance_loss_mlp": 1.07941031, + "diversity_loss_mlp": 0.0, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07002888905047219, + "language_loss": 0.90058106, + "learning_rate": 0.0005624564328835616, + "loss": 0.91149217, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2476, + "time_per_iteration": 2.789257764816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108666, + "balance_loss_mlp": 1.07509637, + "diversity_loss_mlp": 0.0, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06042863191219761, + "language_loss": 0.84203571, + "learning_rate": 0.0005621473185576986, + "loss": 0.85290229, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2477, + "time_per_iteration": 2.724280834197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_mlp": 1.07846594, + "diversity_loss_mlp": 0.0, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.07203405271885309, + "language_loss": 0.87555075, + "learning_rate": 0.0005618381801041068, + "loss": 0.88644993, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2478, + "time_per_iteration": 2.6800026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085708, + "balance_loss_mlp": 1.0738883, + "diversity_loss_mlp": 0.0, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.08495018756940642, + "language_loss": 0.83006722, + "learning_rate": 0.0005615290176428044, + "loss": 0.84092432, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2479, + "time_per_iteration": 2.6456432342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078377, + "balance_loss_mlp": 1.06658673, + "diversity_loss_mlp": 0.0, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07371403414772894, + "language_loss": 0.84979588, + "learning_rate": 0.0005612198312938187, + "loss": 0.86057961, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2480, + "time_per_iteration": 2.7325923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_mlp": 1.0737772, + "diversity_loss_mlp": 0.0, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.05926830515799366, + "language_loss": 0.79493093, + "learning_rate": 0.0005609106211771868, + "loss": 0.80578327, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2481, + "time_per_iteration": 2.8374931812286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108305, + "balance_loss_mlp": 1.07103384, + "diversity_loss_mlp": 0.0, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.06643858588339867, + "language_loss": 0.88938701, + "learning_rate": 0.0005606013874129543, + "loss": 0.90021759, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2482, + "time_per_iteration": 2.7547929286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081649, + "balance_loss_mlp": 1.07017505, + "diversity_loss_mlp": 0.0, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06416127972697647, + "language_loss": 0.80410159, + "learning_rate": 0.0005602921301211768, + "loss": 0.81491804, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2483, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080053, + "balance_loss_mlp": 1.06850159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.07652865967226291, + "language_loss": 0.8209163, + "learning_rate": 0.0005599828494219185, + "loss": 0.83171678, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.11541748, + "routerloss_mlp": 0.0, + "step": 2484, + "time_per_iteration": 2.5415024757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.05903542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.07721505579443601, + "language_loss": 0.89162952, + "learning_rate": 0.0005596735454352527, + "loss": 0.90233779, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2485, + "time_per_iteration": 2.8591346740722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077742, + "balance_loss_mlp": 1.06591046, + "diversity_loss_mlp": 0.0, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07819028279068943, + "language_loss": 0.85696715, + "learning_rate": 0.0005593642182812619, + "loss": 0.86774457, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2486, + "time_per_iteration": 2.679927349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.06575358, + "diversity_loss_mlp": 0.0, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.0859238614993436, + "language_loss": 0.83753216, + "learning_rate": 0.0005590548680800378, + "loss": 0.84830678, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2487, + "time_per_iteration": 3.0984909534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.05950415, + "diversity_loss_mlp": 0.0, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.06795851613398404, + "language_loss": 0.76434267, + "learning_rate": 0.0005587454949516804, + "loss": 0.77505481, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2488, + "time_per_iteration": 2.692324161529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.06507468, + "diversity_loss_mlp": 0.0, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.06921637005003253, + "language_loss": 0.8785038, + "learning_rate": 0.0005584360990162993, + "loss": 0.88927084, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2489, + "time_per_iteration": 2.646521806716919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077817, + "balance_loss_mlp": 1.06614649, + "diversity_loss_mlp": 0.0, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06386300972416134, + "language_loss": 0.85713631, + "learning_rate": 0.0005581266803940124, + "loss": 0.86791456, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2490, + "time_per_iteration": 2.735152244567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.05925143, + "diversity_loss_mlp": 0.0, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0718717211843218, + "language_loss": 0.87536263, + "learning_rate": 0.0005578172392049471, + "loss": 0.88607073, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2491, + "time_per_iteration": 2.7718377113342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00892921, + "balance_loss_mlp": 1.54530287, + "diversity_loss_mlp": 0.21191472, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.033555176901221506, + "language_loss": 0.84551859, + "learning_rate": 0.0005575077755692386, + "loss": 0.85444778, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01431197, + "step": 2492, + "time_per_iteration": 2.81888747215271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05893993, + "diversity_loss_mlp": 0.0, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.054684262853474656, + "language_loss": 0.86001486, + "learning_rate": 0.0005571982896070316, + "loss": 0.8707189, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2493, + "time_per_iteration": 2.655311346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_mlp": 1.07248712, + "diversity_loss_mlp": 0.0, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.07545203546694841, + "language_loss": 0.89854079, + "learning_rate": 0.0005568887814384792, + "loss": 0.90938115, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2494, + "time_per_iteration": 2.5930681228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_mlp": 1.07098675, + "diversity_loss_mlp": 0.0, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07194257940045806, + "language_loss": 0.87281573, + "learning_rate": 0.000556579251183743, + "loss": 0.88364077, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2495, + "time_per_iteration": 2.6386003494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076942, + "balance_loss_mlp": 1.06520605, + "diversity_loss_mlp": 0.0, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.0750590648958695, + "language_loss": 0.80158448, + "learning_rate": 0.0005562696989629936, + "loss": 0.81235385, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.11737061, + "routerloss_mlp": 0.0, + "step": 2496, + "time_per_iteration": 2.7050864696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880705, + "balance_loss_mlp": 1.52288473, + "diversity_loss_mlp": 0.21003026, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.02916103721032611, + "language_loss": 0.82606125, + "learning_rate": 0.0005559601248964095, + "loss": 0.83486831, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424794, + "step": 2497, + "time_per_iteration": 2.6473939418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_mlp": 1.0741564, + "diversity_loss_mlp": 0.0, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.07410871061403823, + "language_loss": 0.85882998, + "learning_rate": 0.0005556505291041783, + "loss": 0.86968333, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.11175537, + "routerloss_mlp": 0.0, + "step": 2498, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105898, + "balance_loss_mlp": 1.09428692, + "diversity_loss_mlp": 0.0, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.06465509842390993, + "language_loss": 0.84413946, + "learning_rate": 0.0005553409117064954, + "loss": 0.8551985, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2499, + "time_per_iteration": 2.880300521850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00859857, + "balance_loss_mlp": 1.48415303, + "diversity_loss_mlp": 0.20870377, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.02869897963967695, + "language_loss": 0.84937358, + "learning_rate": 0.0005550312728235654, + "loss": 0.85797209, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01342856, + "step": 2500, + "time_per_iteration": 2.7199203968048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_mlp": 1.08251953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07331859457791397, + "language_loss": 0.83879191, + "learning_rate": 0.0005547216125756003, + "loss": 0.84973377, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2501, + "time_per_iteration": 2.732786178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098928, + "balance_loss_mlp": 1.08708501, + "diversity_loss_mlp": 0.0, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07387575947985975, + "language_loss": 0.82064617, + "learning_rate": 0.0005544119310828211, + "loss": 0.83163536, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2502, + "time_per_iteration": 3.1029446125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_mlp": 1.08865714, + "diversity_loss_mlp": 0.0, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.06596898477591598, + "language_loss": 0.84657413, + "learning_rate": 0.0005541022284654568, + "loss": 0.8575809, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2503, + "time_per_iteration": 2.901026725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092163, + "balance_loss_mlp": 1.08015907, + "diversity_loss_mlp": 0.0, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.0759157238743441, + "language_loss": 0.83907866, + "learning_rate": 0.0005537925048437446, + "loss": 0.85000032, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2504, + "time_per_iteration": 2.6014060974121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00594545, + "balance_loss_mlp": 1.03097272, + "diversity_loss_mlp": 0.13453583, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.0017952613590721677, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76346016, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179097, + "step": 2505, + "time_per_iteration": 4.960138320922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867388, + "balance_loss_mlp": 1.49711311, + "diversity_loss_mlp": 0.20998067, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.029195885141922995, + "language_loss": 0.88189656, + "learning_rate": 0.0005531729950682664, + "loss": 0.8905704, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01384138, + "step": 2506, + "time_per_iteration": 3.056671142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_mlp": 1.07027662, + "diversity_loss_mlp": 0.0, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.09591114443507165, + "language_loss": 0.84746361, + "learning_rate": 0.000552863209155015, + "loss": 0.85828793, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2507, + "time_per_iteration": 2.473930835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00866012, + "balance_loss_mlp": 1.49284506, + "diversity_loss_mlp": 0.21081753, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.03047035716712285, + "language_loss": 0.82048851, + "learning_rate": 0.0005525534027184461, + "loss": 0.82914865, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01418037, + "step": 2508, + "time_per_iteration": 2.5708260536193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078028, + "balance_loss_mlp": 1.06624985, + "diversity_loss_mlp": 0.0, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.06261213728600334, + "language_loss": 0.83131289, + "learning_rate": 0.0005522435758788365, + "loss": 0.84209323, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2509, + "time_per_iteration": 2.7291650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00853572, + "balance_loss_mlp": 1.46908307, + "diversity_loss_mlp": 0.20966808, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.03495470447814039, + "language_loss": 0.80126894, + "learning_rate": 0.0005519337287564721, + "loss": 0.80980462, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01419635, + "step": 2510, + "time_per_iteration": 2.843698024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_mlp": 1.06536365, + "diversity_loss_mlp": 0.0, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07525780944119016, + "language_loss": 0.83495927, + "learning_rate": 0.000551623861471646, + "loss": 0.84572971, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2511, + "time_per_iteration": 2.7327091693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.1273582, + "diversity_loss_mlp": 0.0, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.052890092991212126, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79952717, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 2512, + "time_per_iteration": 4.820046901702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073764, + "balance_loss_mlp": 1.06182551, + "diversity_loss_mlp": 0.0, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.09417698665840035, + "language_loss": 0.8670119, + "learning_rate": 0.0005510040668958211, + "loss": 0.87774956, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2513, + "time_per_iteration": 2.579780101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051826, + "balance_loss_mlp": 1.04515004, + "diversity_loss_mlp": 0.0, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.02705432320804172, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78812408, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2514, + "time_per_iteration": 4.83507227897644 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106953, + "balance_loss_mlp": 1.05716157, + "diversity_loss_mlp": 0.0, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07432123735470587, + "language_loss": 0.83170015, + "learning_rate": 0.0005503841931138645, + "loss": 0.84239542, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2515, + "time_per_iteration": 2.6834895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071737, + "balance_loss_mlp": 1.05963731, + "diversity_loss_mlp": 0.0, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07510504832931036, + "language_loss": 0.81515384, + "learning_rate": 0.0005500742268214025, + "loss": 0.82587123, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2516, + "time_per_iteration": 2.494479179382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084077, + "balance_loss_mlp": 1.0715425, + "diversity_loss_mlp": 0.0, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06432693662792612, + "language_loss": 0.85142744, + "learning_rate": 0.0005497642410884014, + "loss": 0.86226821, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2517, + "time_per_iteration": 2.760425090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.06788325, + "diversity_loss_mlp": 0.0, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.06763953923030977, + "language_loss": 0.85120749, + "learning_rate": 0.0005494542360352085, + "loss": 0.86201251, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2518, + "time_per_iteration": 2.6524109840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108191, + "balance_loss_mlp": 1.06955993, + "diversity_loss_mlp": 0.0, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.06089591080825084, + "language_loss": 0.85741639, + "learning_rate": 0.0005491442117821783, + "loss": 0.86823547, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2519, + "time_per_iteration": 2.7461459636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079216, + "balance_loss_mlp": 1.06654429, + "diversity_loss_mlp": 0.0, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.07584750574127574, + "language_loss": 0.87494171, + "learning_rate": 0.0005488341684496732, + "loss": 0.88573384, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2520, + "time_per_iteration": 2.6621458530426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.06843615, + "diversity_loss_mlp": 0.0, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06605179609441998, + "language_loss": 0.9207437, + "learning_rate": 0.0005485241061580624, + "loss": 0.9315502, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2521, + "time_per_iteration": 2.772949457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.07741094, + "diversity_loss_mlp": 0.0, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.06556104217544546, + "language_loss": 0.8458938, + "learning_rate": 0.0005482140250277228, + "loss": 0.85679281, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2522, + "time_per_iteration": 2.978330135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847105, + "balance_loss_mlp": 1.45509815, + "diversity_loss_mlp": 0.21114388, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.03368619412239962, + "language_loss": 0.87090278, + "learning_rate": 0.0005479039251790387, + "loss": 0.87937379, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01398425, + "step": 2523, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00840008, + "balance_loss_mlp": 1.44148707, + "diversity_loss_mlp": 0.21069397, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.03188648694570784, + "language_loss": 0.84722733, + "learning_rate": 0.0005475938067324014, + "loss": 0.85562754, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0139178, + "step": 2524, + "time_per_iteration": 2.859184980392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106923, + "balance_loss_mlp": 1.09528267, + "diversity_loss_mlp": 0.0, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.06962736532334403, + "language_loss": 0.83518255, + "learning_rate": 0.0005472836698082098, + "loss": 0.84625173, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2525, + "time_per_iteration": 2.534783363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_mlp": 1.08923149, + "diversity_loss_mlp": 0.0, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.07423434170097615, + "language_loss": 0.84140873, + "learning_rate": 0.0005469735145268694, + "loss": 0.85241902, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2526, + "time_per_iteration": 2.7064108848571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090982, + "balance_loss_mlp": 1.07928169, + "diversity_loss_mlp": 0.0, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0731540325655248, + "language_loss": 0.81093931, + "learning_rate": 0.0005466633410087933, + "loss": 0.82184911, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2527, + "time_per_iteration": 2.682969570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_mlp": 1.07793164, + "diversity_loss_mlp": 0.0, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03711409557498352, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78346336, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.07568359, + "routerloss_mlp": 0.0, + "step": 2528, + "time_per_iteration": 4.962444067001343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_mlp": 1.07360601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.07791605184695856, + "language_loss": 0.88148236, + "learning_rate": 0.0005460429397441214, + "loss": 0.89233321, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2529, + "time_per_iteration": 2.5908102989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00835644, + "balance_loss_mlp": 1.43002903, + "diversity_loss_mlp": 0.21195745, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.03186279831907627, + "language_loss": 0.87013817, + "learning_rate": 0.0005457327122383866, + "loss": 0.87849462, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01465126, + "step": 2530, + "time_per_iteration": 2.656264543533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_mlp": 1.02939153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.02373673385224348, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75673413, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.07519531, + "routerloss_mlp": 0.0, + "step": 2531, + "time_per_iteration": 4.838496208190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_mlp": 1.08965194, + "diversity_loss_mlp": 0.0, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.06845758574896237, + "language_loss": 0.75823385, + "learning_rate": 0.0005451122040823244, + "loss": 0.76924324, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2532, + "time_per_iteration": 2.770751714706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099102, + "balance_loss_mlp": 1.08746696, + "diversity_loss_mlp": 0.0, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07387169787784394, + "language_loss": 0.77164292, + "learning_rate": 0.0005448019236728997, + "loss": 0.7826339, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2533, + "time_per_iteration": 2.8874497413635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837303, + "balance_loss_mlp": 1.43305767, + "diversity_loss_mlp": 0.21233971, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.03246629845535473, + "language_loss": 0.8471576, + "learning_rate": 0.0005444916258698255, + "loss": 0.85553062, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01460437, + "step": 2534, + "time_per_iteration": 2.623748540878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112009, + "balance_loss_mlp": 1.10867584, + "diversity_loss_mlp": 0.0, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06488105381348498, + "language_loss": 0.86077154, + "learning_rate": 0.0005441813107935704, + "loss": 0.87197244, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2535, + "time_per_iteration": 2.6705739498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124443, + "balance_loss_mlp": 1.11277819, + "diversity_loss_mlp": 0.0, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07112550287999594, + "language_loss": 0.86025345, + "learning_rate": 0.0005438709785646091, + "loss": 0.87149793, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2536, + "time_per_iteration": 2.5624749660491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120427, + "balance_loss_mlp": 1.10864902, + "diversity_loss_mlp": 0.0, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.08492074314505418, + "language_loss": 0.86885595, + "learning_rate": 0.0005435606293034234, + "loss": 0.8800602, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2537, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121847, + "balance_loss_mlp": 1.11035514, + "diversity_loss_mlp": 0.0, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.08214525409599778, + "language_loss": 0.84619427, + "learning_rate": 0.0005432502631305016, + "loss": 0.8574127, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2538, + "time_per_iteration": 2.700613021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.10190618, + "diversity_loss_mlp": 0.0, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06429037959601741, + "language_loss": 0.83193302, + "learning_rate": 0.0005429398801663386, + "loss": 0.84306723, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2539, + "time_per_iteration": 2.9839913845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097658, + "balance_loss_mlp": 1.08599913, + "diversity_loss_mlp": 0.0, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.12053819121868696, + "language_loss": 0.8290484, + "learning_rate": 0.0005426294805314355, + "loss": 0.84002495, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2540, + "time_per_iteration": 2.5029373168945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.08291781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.06245664696917761, + "language_loss": 0.80155998, + "learning_rate": 0.0005423190643463003, + "loss": 0.81250799, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2541, + "time_per_iteration": 2.949772357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093208, + "balance_loss_mlp": 1.08163261, + "diversity_loss_mlp": 0.0, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.07791209549750817, + "language_loss": 0.8281579, + "learning_rate": 0.0005420086317314473, + "loss": 0.83908999, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2542, + "time_per_iteration": 2.6383941173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088553, + "balance_loss_mlp": 1.0765729, + "diversity_loss_mlp": 0.0, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06362759827284906, + "language_loss": 0.81081557, + "learning_rate": 0.0005416981828073971, + "loss": 0.82170111, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.11981201, + "routerloss_mlp": 0.0, + "step": 2543, + "time_per_iteration": 2.8023576736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007156, + "balance_loss_mlp": 0.99990815, + "diversity_loss_mlp": 0.0, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.01938913368632236, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78122175, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 2544, + "time_per_iteration": 4.817458629608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.08184147, + "diversity_loss_mlp": 0.0, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.08678858450341921, + "language_loss": 0.84937072, + "learning_rate": 0.000541077236513819, + "loss": 0.86030519, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2545, + "time_per_iteration": 2.5271120071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089352, + "balance_loss_mlp": 1.07800293, + "diversity_loss_mlp": 0.0, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.07207098978073255, + "language_loss": 0.82449925, + "learning_rate": 0.0005407667393853638, + "loss": 0.83539271, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2546, + "time_per_iteration": 2.6385204792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.08250618, + "diversity_loss_mlp": 0.0, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06843607218978102, + "language_loss": 0.83673334, + "learning_rate": 0.0005404562264298569, + "loss": 0.84766948, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2547, + "time_per_iteration": 2.845250368118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102851, + "balance_loss_mlp": 1.09120405, + "diversity_loss_mlp": 0.0, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.06940893068641271, + "language_loss": 0.83999467, + "learning_rate": 0.0005401456977678498, + "loss": 0.8510232, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2548, + "time_per_iteration": 2.638720750808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099322, + "balance_loss_mlp": 1.08754444, + "diversity_loss_mlp": 0.0, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08453175850654031, + "language_loss": 0.77431965, + "learning_rate": 0.0005398351535199008, + "loss": 0.78531289, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2549, + "time_per_iteration": 3.064035415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_mlp": 1.09175706, + "diversity_loss_mlp": 0.0, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.07238427843662706, + "language_loss": 0.84189212, + "learning_rate": 0.0005395245938065735, + "loss": 0.85292226, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2550, + "time_per_iteration": 2.7746829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118468, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.08583684211433391, + "language_loss": 0.82631576, + "learning_rate": 0.0005392140187484379, + "loss": 0.83750039, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2551, + "time_per_iteration": 2.582195281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124142, + "balance_loss_mlp": 1.11273384, + "diversity_loss_mlp": 0.0, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.0682243054902728, + "language_loss": 0.89719319, + "learning_rate": 0.0005389034284660701, + "loss": 0.90843463, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2552, + "time_per_iteration": 2.824427366256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_mlp": 1.12022352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.08386347311462448, + "language_loss": 0.82537109, + "learning_rate": 0.000538592823080052, + "loss": 0.83668673, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2553, + "time_per_iteration": 3.24122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.11565781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.06967590045443849, + "language_loss": 0.84592807, + "learning_rate": 0.000538282202710971, + "loss": 0.85719973, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2554, + "time_per_iteration": 2.5753910541534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130476, + "balance_loss_mlp": 1.11918652, + "diversity_loss_mlp": 0.0, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.07442252581599826, + "language_loss": 0.82315147, + "learning_rate": 0.000537971567479421, + "loss": 0.83445626, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2555, + "time_per_iteration": 2.7354228496551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127557, + "balance_loss_mlp": 1.11596429, + "diversity_loss_mlp": 0.0, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.09076326784032986, + "language_loss": 0.88129175, + "learning_rate": 0.0005376609175060011, + "loss": 0.8925674, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2556, + "time_per_iteration": 2.6124610900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_mlp": 1.09465659, + "diversity_loss_mlp": 0.0, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07210041581715526, + "language_loss": 0.80779845, + "learning_rate": 0.0005373502529113162, + "loss": 0.81886077, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2557, + "time_per_iteration": 2.823993444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100884, + "balance_loss_mlp": 1.08888519, + "diversity_loss_mlp": 0.0, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.07460313059090624, + "language_loss": 0.81449521, + "learning_rate": 0.0005370395738159773, + "loss": 0.82550406, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2558, + "time_per_iteration": 2.6436777114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00834873, + "balance_loss_mlp": 1.42800272, + "diversity_loss_mlp": 0.21467975, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.03347414568603151, + "language_loss": 0.82822633, + "learning_rate": 0.0005367288803406003, + "loss": 0.83657515, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01353174, + "step": 2559, + "time_per_iteration": 2.662224531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083349, + "balance_loss_mlp": 1.07132101, + "diversity_loss_mlp": 0.0, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0788259825299616, + "language_loss": 0.818443, + "learning_rate": 0.0005364181726058073, + "loss": 0.82927656, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2560, + "time_per_iteration": 2.686300277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.06417727, + "diversity_loss_mlp": 0.0, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.07955060847799823, + "language_loss": 0.8272332, + "learning_rate": 0.0005361074507322261, + "loss": 0.83799613, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2561, + "time_per_iteration": 2.5809431076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.06138754, + "diversity_loss_mlp": 0.0, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07091460094801966, + "language_loss": 0.81425411, + "learning_rate": 0.000535796714840489, + "loss": 0.82498884, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2562, + "time_per_iteration": 2.6425187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073356, + "balance_loss_mlp": 1.06107163, + "diversity_loss_mlp": 0.0, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.10871355986071002, + "language_loss": 0.83800626, + "learning_rate": 0.0005354859650512348, + "loss": 0.84873986, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2563, + "time_per_iteration": 2.7957375049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074544, + "balance_loss_mlp": 1.06282604, + "diversity_loss_mlp": 0.0, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.0798917687203661, + "language_loss": 0.87428886, + "learning_rate": 0.0005351752014851074, + "loss": 0.88503432, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2564, + "time_per_iteration": 2.6205673217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_mlp": 1.07352281, + "diversity_loss_mlp": 0.0, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.06874397476353511, + "language_loss": 0.83621442, + "learning_rate": 0.0005348644242627553, + "loss": 0.84706771, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2565, + "time_per_iteration": 2.7460625171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010105, + "balance_loss_mlp": 1.00411022, + "diversity_loss_mlp": 0.0, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.013767653611631516, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76297128, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2566, + "time_per_iteration": 4.943475723266602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110567, + "balance_loss_mlp": 1.09899187, + "diversity_loss_mlp": 0.0, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.08759046492811678, + "language_loss": 0.81650245, + "learning_rate": 0.0005342428293320013, + "loss": 0.82760805, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2567, + "time_per_iteration": 2.7889564037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_mlp": 1.09142327, + "diversity_loss_mlp": 0.0, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07999691418133484, + "language_loss": 0.8344667, + "learning_rate": 0.0005339320118649238, + "loss": 0.84549326, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2568, + "time_per_iteration": 2.7774229049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.09715271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.07608170940546952, + "language_loss": 0.86422324, + "learning_rate": 0.000533621181224271, + "loss": 0.87530512, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2569, + "time_per_iteration": 2.7708005905151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08442283, + "diversity_loss_mlp": 0.0, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.06858054906862693, + "language_loss": 0.8138749, + "learning_rate": 0.0005333103375307182, + "loss": 0.82483125, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2570, + "time_per_iteration": 2.8407034873962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07972121, + "diversity_loss_mlp": 0.0, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06174009778797697, + "language_loss": 0.85711801, + "learning_rate": 0.0005329994809049451, + "loss": 0.86802495, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.10974121, + "routerloss_mlp": 0.0, + "step": 2571, + "time_per_iteration": 2.7500712871551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096363, + "balance_loss_mlp": 1.08508563, + "diversity_loss_mlp": 0.0, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.06855083904022342, + "language_loss": 0.88066995, + "learning_rate": 0.0005326886114676375, + "loss": 0.89163363, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2572, + "time_per_iteration": 2.730137825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.07269001, + "diversity_loss_mlp": 0.0, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06053914015656951, + "language_loss": 0.88364595, + "learning_rate": 0.0005323777293394854, + "loss": 0.89448464, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2573, + "time_per_iteration": 2.539825201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.07365584, + "diversity_loss_mlp": 0.0, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06797932871808014, + "language_loss": 0.81904709, + "learning_rate": 0.000532066834641184, + "loss": 0.8298943, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.11065674, + "routerloss_mlp": 0.0, + "step": 2574, + "time_per_iteration": 2.6663713455200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_mlp": 1.09271336, + "diversity_loss_mlp": 0.0, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07191084425213706, + "language_loss": 0.85331243, + "learning_rate": 0.0005317559274934334, + "loss": 0.86435068, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2575, + "time_per_iteration": 2.756410598754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097116, + "balance_loss_mlp": 1.08592236, + "diversity_loss_mlp": 0.0, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.08893709148941176, + "language_loss": 0.80365205, + "learning_rate": 0.0005314450080169382, + "loss": 0.81462318, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2576, + "time_per_iteration": 2.613163471221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092174, + "balance_loss_mlp": 1.0810523, + "diversity_loss_mlp": 0.0, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.10818754121519983, + "language_loss": 0.8082127, + "learning_rate": 0.0005311340763324083, + "loss": 0.81913447, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2577, + "time_per_iteration": 2.5670807361602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087439, + "balance_loss_mlp": 1.07612574, + "diversity_loss_mlp": 0.0, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.07097138632102568, + "language_loss": 0.82323599, + "learning_rate": 0.0005308231325605578, + "loss": 0.83411032, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2578, + "time_per_iteration": 2.6519079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085757, + "balance_loss_mlp": 1.07421172, + "diversity_loss_mlp": 0.0, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06601832089031445, + "language_loss": 0.76727217, + "learning_rate": 0.0005305121768221061, + "loss": 0.7781297, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2579, + "time_per_iteration": 3.1306209564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_mlp": 1.03489161, + "diversity_loss_mlp": 0.0, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.022004289450105873, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76079202, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2580, + "time_per_iteration": 4.8141255378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079045, + "balance_loss_mlp": 1.06767821, + "diversity_loss_mlp": 0.0, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06618835036619775, + "language_loss": 0.91614985, + "learning_rate": 0.0005298902299282984, + "loss": 0.92694032, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2581, + "time_per_iteration": 2.586012125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087579, + "balance_loss_mlp": 1.07617044, + "diversity_loss_mlp": 0.0, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.07143589820149647, + "language_loss": 0.84265745, + "learning_rate": 0.0005295792390144033, + "loss": 0.85353327, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2582, + "time_per_iteration": 2.704911708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096311, + "balance_loss_mlp": 1.08442605, + "diversity_loss_mlp": 0.0, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.07556433689349051, + "language_loss": 0.83576399, + "learning_rate": 0.0005292682366168294, + "loss": 0.84672707, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2583, + "time_per_iteration": 2.5530638694763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_mlp": 1.09309435, + "diversity_loss_mlp": 0.0, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06699014279274042, + "language_loss": 0.80089158, + "learning_rate": 0.0005289572228563181, + "loss": 0.81194162, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2584, + "time_per_iteration": 2.729093551635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100707, + "balance_loss_mlp": 1.08861935, + "diversity_loss_mlp": 0.0, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.0657007833960997, + "language_loss": 0.83234823, + "learning_rate": 0.000528646197853616, + "loss": 0.8433553, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2585, + "time_per_iteration": 2.727252721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113697, + "balance_loss_mlp": 1.10166335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.07376563164337009, + "language_loss": 0.85810697, + "learning_rate": 0.0005283351617294735, + "loss": 0.86924398, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2586, + "time_per_iteration": 2.945610761642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011716, + "balance_loss_mlp": 1.00470638, + "diversity_loss_mlp": 0.0, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.017193207514109847, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77648377, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.0703125, + "routerloss_mlp": 0.0, + "step": 2587, + "time_per_iteration": 5.038366079330444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.07597303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06591325697086226, + "language_loss": 0.86769819, + "learning_rate": 0.0005277130565998916, + "loss": 0.87858337, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2588, + "time_per_iteration": 2.7726681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_mlp": 1.07443595, + "diversity_loss_mlp": 0.0, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05822748641904789, + "language_loss": 0.81899714, + "learning_rate": 0.0005274019878359748, + "loss": 0.82986516, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 2589, + "time_per_iteration": 2.733985424041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.06275249, + "diversity_loss_mlp": 0.0, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.0736619230298454, + "language_loss": 0.87174684, + "learning_rate": 0.0005270909084336628, + "loss": 0.88249791, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2590, + "time_per_iteration": 2.648728370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075145, + "balance_loss_mlp": 1.06231809, + "diversity_loss_mlp": 0.0, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.07329601175103365, + "language_loss": 0.8877548, + "learning_rate": 0.0005267798185137276, + "loss": 0.89850616, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.12835693, + "routerloss_mlp": 0.0, + "step": 2591, + "time_per_iteration": 2.616903066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061242, + "balance_loss_mlp": 1.04852843, + "diversity_loss_mlp": 0.0, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.0712913700859702, + "language_loss": 0.89140213, + "learning_rate": 0.0005264687181969444, + "loss": 0.90201461, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 2592, + "time_per_iteration": 2.7121951580047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067017, + "balance_loss_mlp": 1.05430353, + "diversity_loss_mlp": 0.0, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07969645648170227, + "language_loss": 0.75208342, + "learning_rate": 0.0005261576076040937, + "loss": 0.76275361, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 2593, + "time_per_iteration": 3.248811721801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_mlp": 1.04746807, + "diversity_loss_mlp": 0.0, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07355463018535204, + "language_loss": 0.84396625, + "learning_rate": 0.0005258464868559591, + "loss": 0.85456228, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2594, + "time_per_iteration": 2.6535778045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_mlp": 1.0461601, + "diversity_loss_mlp": 0.0, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06735340586139127, + "language_loss": 0.88490266, + "learning_rate": 0.0005255353560733284, + "loss": 0.89548326, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2595, + "time_per_iteration": 2.5711045265197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_mlp": 1.03453541, + "diversity_loss_mlp": 0.0, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.025598241729826776, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619136, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 2596, + "time_per_iteration": 4.7992448806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_mlp": 1.05498767, + "diversity_loss_mlp": 0.0, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.07107233717475309, + "language_loss": 0.83179224, + "learning_rate": 0.0005249130648877492, + "loss": 0.84246206, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2597, + "time_per_iteration": 2.7089900970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068426, + "balance_loss_mlp": 1.05646324, + "diversity_loss_mlp": 0.0, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.08792128719199578, + "language_loss": 0.84945238, + "learning_rate": 0.0005246019047263953, + "loss": 0.86013663, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2598, + "time_per_iteration": 2.4586942195892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070932, + "balance_loss_mlp": 1.0594883, + "diversity_loss_mlp": 0.0, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.08031275074858332, + "language_loss": 0.82562858, + "learning_rate": 0.0005242907350137353, + "loss": 0.83633792, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2599, + "time_per_iteration": 2.547146797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075807, + "balance_loss_mlp": 1.06445217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.08690624784708721, + "language_loss": 0.79332286, + "learning_rate": 0.0005239795558705754, + "loss": 0.80408096, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2600, + "time_per_iteration": 2.5985541343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077027, + "balance_loss_mlp": 1.06555915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.06025548364908716, + "language_loss": 0.89517641, + "learning_rate": 0.0005236683674177264, + "loss": 0.90594667, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2601, + "time_per_iteration": 2.6358349323272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090245, + "balance_loss_mlp": 1.07874131, + "diversity_loss_mlp": 0.0, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06252214062087984, + "language_loss": 0.82497251, + "learning_rate": 0.0005233571697760021, + "loss": 0.83587497, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2602, + "time_per_iteration": 2.8629817962646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112229, + "balance_loss_mlp": 1.10087442, + "diversity_loss_mlp": 0.0, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06974132169475507, + "language_loss": 0.8293485, + "learning_rate": 0.0005230459630662203, + "loss": 0.84047079, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.11352539, + "routerloss_mlp": 0.0, + "step": 2603, + "time_per_iteration": 2.939380168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114631, + "balance_loss_mlp": 1.10359812, + "diversity_loss_mlp": 0.0, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.10511771954620508, + "language_loss": 0.81605637, + "learning_rate": 0.0005227347474092022, + "loss": 0.82720268, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2604, + "time_per_iteration": 2.7169747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.11197877, + "diversity_loss_mlp": 0.0, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.07495893748856379, + "language_loss": 0.83243322, + "learning_rate": 0.0005224235229257724, + "loss": 0.84366548, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2605, + "time_per_iteration": 2.6940438747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113092, + "balance_loss_mlp": 1.10178471, + "diversity_loss_mlp": 0.0, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.06884013858989874, + "language_loss": 0.86851203, + "learning_rate": 0.0005221122897367589, + "loss": 0.87964296, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2606, + "time_per_iteration": 2.800685405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109504, + "balance_loss_mlp": 1.09854841, + "diversity_loss_mlp": 0.0, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08142217271827161, + "language_loss": 0.81335354, + "learning_rate": 0.0005218010479629932, + "loss": 0.82444859, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2607, + "time_per_iteration": 2.657087564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098904, + "balance_loss_mlp": 1.08753133, + "diversity_loss_mlp": 0.0, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.08269023882009051, + "language_loss": 0.82140303, + "learning_rate": 0.0005214897977253102, + "loss": 0.83239204, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2608, + "time_per_iteration": 2.649846076965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_mlp": 1.07372093, + "diversity_loss_mlp": 0.0, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.061165709745894754, + "language_loss": 0.84233439, + "learning_rate": 0.0005211785391445473, + "loss": 0.8531844, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2609, + "time_per_iteration": 2.7179222106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087043, + "balance_loss_mlp": 1.07538986, + "diversity_loss_mlp": 0.0, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.06641391212047838, + "language_loss": 0.79080439, + "learning_rate": 0.0005208672723415467, + "loss": 0.80167478, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2610, + "time_per_iteration": 2.7928884029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.07359457, + "diversity_loss_mlp": 0.0, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.07063839016412009, + "language_loss": 0.79436052, + "learning_rate": 0.0005205559974371525, + "loss": 0.80521345, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2611, + "time_per_iteration": 2.75744366645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085649, + "balance_loss_mlp": 1.07412767, + "diversity_loss_mlp": 0.0, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06307258943078059, + "language_loss": 0.82345438, + "learning_rate": 0.0005202447145522123, + "loss": 0.83431089, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2612, + "time_per_iteration": 2.6847879886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084149, + "balance_loss_mlp": 1.07245421, + "diversity_loss_mlp": 0.0, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.060686478103186246, + "language_loss": 0.79358983, + "learning_rate": 0.0005199334238075769, + "loss": 0.80443138, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2613, + "time_per_iteration": 2.560041666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_mlp": 1.07277226, + "diversity_loss_mlp": 0.0, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.086387426867178, + "language_loss": 0.91963339, + "learning_rate": 0.0005196221253241, + "loss": 0.93048155, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2614, + "time_per_iteration": 2.6397578716278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107839, + "balance_loss_mlp": 1.06617713, + "diversity_loss_mlp": 0.0, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.09198716130289855, + "language_loss": 0.82890773, + "learning_rate": 0.0005193108192226383, + "loss": 0.83969164, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2615, + "time_per_iteration": 2.7370193004608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.06396329, + "diversity_loss_mlp": 0.0, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.08941342921082604, + "language_loss": 0.86907744, + "learning_rate": 0.000518999505624052, + "loss": 0.87983918, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2616, + "time_per_iteration": 2.733515739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067104, + "balance_loss_mlp": 1.05521274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05504525356098391, + "language_loss": 0.83447164, + "learning_rate": 0.000518688184649203, + "loss": 0.84514272, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2617, + "time_per_iteration": 2.816542625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075166, + "balance_loss_mlp": 1.06264269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07489503160460931, + "language_loss": 0.83596766, + "learning_rate": 0.0005183768564189577, + "loss": 0.84671938, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2618, + "time_per_iteration": 2.5781893730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081949, + "balance_loss_mlp": 1.07029045, + "diversity_loss_mlp": 0.0, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.0695581827230682, + "language_loss": 0.81485611, + "learning_rate": 0.0005180655210541838, + "loss": 0.82567555, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2619, + "time_per_iteration": 2.5642077922821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091231, + "balance_loss_mlp": 1.07894695, + "diversity_loss_mlp": 0.0, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.08072673001204132, + "language_loss": 0.83226323, + "learning_rate": 0.0005177541786757527, + "loss": 0.84317553, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2620, + "time_per_iteration": 2.7365450859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_mlp": 1.0882231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.0921594393427519, + "language_loss": 0.82626402, + "learning_rate": 0.000517442829404538, + "loss": 0.83727121, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2621, + "time_per_iteration": 3.053333044052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097629, + "balance_loss_mlp": 1.08534431, + "diversity_loss_mlp": 0.0, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.0844592365120011, + "language_loss": 0.87026393, + "learning_rate": 0.0005171314733614166, + "loss": 0.88124025, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2622, + "time_per_iteration": 2.8867554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099715, + "balance_loss_mlp": 1.08721614, + "diversity_loss_mlp": 0.0, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.07191738026805333, + "language_loss": 0.78457403, + "learning_rate": 0.0005168201106672671, + "loss": 0.79557121, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 2623, + "time_per_iteration": 2.7532849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_mlp": 1.07122076, + "diversity_loss_mlp": 0.0, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.06664161086213699, + "language_loss": 0.84876573, + "learning_rate": 0.0005165087414429717, + "loss": 0.85960108, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2624, + "time_per_iteration": 2.614475965499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.061566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.06712294156504883, + "language_loss": 0.83509946, + "learning_rate": 0.0005161973658094144, + "loss": 0.84583604, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2625, + "time_per_iteration": 2.6536033153533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875819, + "balance_loss_mlp": 1.51064336, + "diversity_loss_mlp": 0.21324398, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.02954045761884847, + "language_loss": 0.82599998, + "learning_rate": 0.000515885983887482, + "loss": 0.83475816, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01387555, + "step": 2626, + "time_per_iteration": 2.801612138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070563, + "balance_loss_mlp": 1.05863595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.07357396162877478, + "language_loss": 0.84283531, + "learning_rate": 0.0005155745957980636, + "loss": 0.8535409, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2627, + "time_per_iteration": 2.6239585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071483, + "balance_loss_mlp": 1.0589962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.06901961430938243, + "language_loss": 0.88532668, + "learning_rate": 0.000515263201662051, + "loss": 0.89604151, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2628, + "time_per_iteration": 2.65803861618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107038, + "balance_loss_mlp": 1.05840504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.06314416177701848, + "language_loss": 0.8250618, + "learning_rate": 0.0005149518016003378, + "loss": 0.8357656, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2629, + "time_per_iteration": 3.1646623611450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.04946709, + "diversity_loss_mlp": 0.0, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.1007750022567515, + "language_loss": 0.82337832, + "learning_rate": 0.0005146403957338206, + "loss": 0.83399695, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 2630, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.05236936, + "diversity_loss_mlp": 0.0, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06667308072604639, + "language_loss": 0.82288837, + "learning_rate": 0.0005143289841833975, + "loss": 0.83353263, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2631, + "time_per_iteration": 2.8448615074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.05643749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.09203997555384738, + "language_loss": 0.82179189, + "learning_rate": 0.0005140175670699696, + "loss": 0.83247638, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2632, + "time_per_iteration": 2.642666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067258, + "balance_loss_mlp": 1.05545044, + "diversity_loss_mlp": 0.0, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04894531982576629, + "language_loss": 0.82796603, + "learning_rate": 0.0005137061445144395, + "loss": 0.8386386, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2633, + "time_per_iteration": 2.8800737857818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076133, + "balance_loss_mlp": 1.06476033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06583044180155191, + "language_loss": 0.87074906, + "learning_rate": 0.000513394716637712, + "loss": 0.88151038, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2634, + "time_per_iteration": 2.7507505416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_mlp": 1.02921486, + "diversity_loss_mlp": 0.0, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03533282921310782, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80227697, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2635, + "time_per_iteration": 4.825605869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110028, + "balance_loss_mlp": 1.08881176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.07735545811428028, + "language_loss": 0.81068468, + "learning_rate": 0.0005127718454042958, + "loss": 0.82168746, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2636, + "time_per_iteration": 2.8241050243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_mlp": 1.08840299, + "diversity_loss_mlp": 0.0, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.08187506034762644, + "language_loss": 0.83836603, + "learning_rate": 0.0005124604022894269, + "loss": 0.8493644, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2637, + "time_per_iteration": 2.9366774559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019034, + "balance_loss_mlp": 1.01259708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.025963071476552062, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7820726, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 2638, + "time_per_iteration": 4.828620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092892, + "balance_loss_mlp": 1.08166814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.07837351333742608, + "language_loss": 0.83244252, + "learning_rate": 0.0005118375016679325, + "loss": 0.84337139, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2639, + "time_per_iteration": 2.801852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077953, + "balance_loss_mlp": 1.0666697, + "diversity_loss_mlp": 0.0, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.07879033409242599, + "language_loss": 0.80358827, + "learning_rate": 0.0005115260444031382, + "loss": 0.81436777, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2640, + "time_per_iteration": 2.596771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010253, + "balance_loss_mlp": 1.00422084, + "diversity_loss_mlp": 0.0, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011737851482073082, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79742074, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.06030273, + "routerloss_mlp": 0.0, + "step": 2641, + "time_per_iteration": 4.948842287063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075403, + "balance_loss_mlp": 1.06412029, + "diversity_loss_mlp": 0.0, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.08031663653823312, + "language_loss": 0.8740893, + "learning_rate": 0.0005109031165700483, + "loss": 0.88484335, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2642, + "time_per_iteration": 2.5833895206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_mlp": 1.04938459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.06372027514248847, + "language_loss": 0.83170295, + "learning_rate": 0.0005105916462435945, + "loss": 0.84231174, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2643, + "time_per_iteration": 2.841296911239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.05014455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0681709540800111, + "language_loss": 0.85266602, + "learning_rate": 0.0005102801718050989, + "loss": 0.86328042, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2644, + "time_per_iteration": 2.680905818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058539, + "balance_loss_mlp": 1.04714894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.07434027721258654, + "language_loss": 0.89314902, + "learning_rate": 0.0005099686933754867, + "loss": 0.90373439, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2645, + "time_per_iteration": 2.723043441772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_mlp": 1.05088663, + "diversity_loss_mlp": 0.0, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07256046334666034, + "language_loss": 0.8429243, + "learning_rate": 0.0005096572110756845, + "loss": 0.85354686, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2646, + "time_per_iteration": 2.6682143211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069615, + "balance_loss_mlp": 1.05801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06200075514200526, + "language_loss": 0.85445803, + "learning_rate": 0.0005093457250266205, + "loss": 0.86515421, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2647, + "time_per_iteration": 2.682891368865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.05816472, + "diversity_loss_mlp": 0.0, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1092618136395953, + "language_loss": 0.83279526, + "learning_rate": 0.000509034235349224, + "loss": 0.84349322, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2648, + "time_per_iteration": 2.7173004150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068823, + "balance_loss_mlp": 1.05756938, + "diversity_loss_mlp": 0.0, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07759183255272654, + "language_loss": 0.81290972, + "learning_rate": 0.0005087227421644266, + "loss": 0.82359791, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2649, + "time_per_iteration": 2.79217791557312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.05469334, + "diversity_loss_mlp": 0.0, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.07036579944312285, + "language_loss": 0.85978615, + "learning_rate": 0.0005084112455931602, + "loss": 0.87045121, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2650, + "time_per_iteration": 2.593323230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.06125915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.06673546987966349, + "language_loss": 0.85377133, + "learning_rate": 0.0005080997457563586, + "loss": 0.86449993, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2651, + "time_per_iteration": 2.5473101139068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074592, + "balance_loss_mlp": 1.06324303, + "diversity_loss_mlp": 0.0, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.07839929831674766, + "language_loss": 0.79146206, + "learning_rate": 0.0005077882427749569, + "loss": 0.80220807, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2652, + "time_per_iteration": 2.5378577709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081444, + "balance_loss_mlp": 1.07002354, + "diversity_loss_mlp": 0.0, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09222135648623411, + "language_loss": 0.84599656, + "learning_rate": 0.0005074767367698913, + "loss": 0.85681099, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2653, + "time_per_iteration": 2.7541823387145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.0749042, + "diversity_loss_mlp": 0.0, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.07250262260433718, + "language_loss": 0.82987714, + "learning_rate": 0.0005071652278620988, + "loss": 0.84074312, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2654, + "time_per_iteration": 3.0615251064300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089781, + "balance_loss_mlp": 1.07870018, + "diversity_loss_mlp": 0.0, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.07582936293709001, + "language_loss": 0.83328903, + "learning_rate": 0.0005068537161725186, + "loss": 0.84418684, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2655, + "time_per_iteration": 2.7840993404388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092544, + "balance_loss_mlp": 1.08139753, + "diversity_loss_mlp": 0.0, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07786356346883126, + "language_loss": 0.84288549, + "learning_rate": 0.0005065422018220893, + "loss": 0.85381097, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.1114502, + "routerloss_mlp": 0.0, + "step": 2656, + "time_per_iteration": 2.832575798034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.09118247, + "diversity_loss_mlp": 0.0, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.08194812181942494, + "language_loss": 0.80392313, + "learning_rate": 0.0005062306849317521, + "loss": 0.81494415, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2657, + "time_per_iteration": 2.794966220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.08891487, + "diversity_loss_mlp": 0.0, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.08210850574888065, + "language_loss": 0.83486134, + "learning_rate": 0.0005059191656224487, + "loss": 0.84586298, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2658, + "time_per_iteration": 2.744889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093286, + "balance_loss_mlp": 1.08238411, + "diversity_loss_mlp": 0.0, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.07321009008554179, + "language_loss": 0.88860798, + "learning_rate": 0.0005056076440151212, + "loss": 0.89954078, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2659, + "time_per_iteration": 2.6951825618743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113007, + "balance_loss_mlp": 1.12453902, + "diversity_loss_mlp": 0.0, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.07076104465295206, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77418184, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2660, + "time_per_iteration": 4.850585460662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.07051301, + "diversity_loss_mlp": 0.0, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06225287802871053, + "language_loss": 0.86966121, + "learning_rate": 0.0005049845943901691, + "loss": 0.88047487, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2661, + "time_per_iteration": 2.8342370986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079888, + "balance_loss_mlp": 1.0692786, + "diversity_loss_mlp": 0.0, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.058043198592839004, + "language_loss": 0.86637139, + "learning_rate": 0.0005046730666144338, + "loss": 0.87717032, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2662, + "time_per_iteration": 2.8066177368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078601, + "balance_loss_mlp": 1.06801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.058701328600128284, + "language_loss": 0.87834954, + "learning_rate": 0.0005043615370244532, + "loss": 0.88913548, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2663, + "time_per_iteration": 3.3716113567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_mlp": 1.04589903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02890820887526385, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79295814, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2664, + "time_per_iteration": 4.632098913192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074303, + "balance_loss_mlp": 1.0636878, + "diversity_loss_mlp": 0.0, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.05776678043634197, + "language_loss": 0.85301316, + "learning_rate": 0.0005037384728855425, + "loss": 0.86375624, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2665, + "time_per_iteration": 2.8025074005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077204, + "balance_loss_mlp": 1.06618285, + "diversity_loss_mlp": 0.0, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08001364709617295, + "language_loss": 0.84092522, + "learning_rate": 0.0005034269385785075, + "loss": 0.85169727, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2666, + "time_per_iteration": 2.6508989334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070249, + "balance_loss_mlp": 1.05929327, + "diversity_loss_mlp": 0.0, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.06550806602425656, + "language_loss": 0.849998, + "learning_rate": 0.0005031154029410168, + "loss": 0.86070049, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2667, + "time_per_iteration": 2.6072959899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062599, + "balance_loss_mlp": 1.05130351, + "diversity_loss_mlp": 0.0, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07261202613887993, + "language_loss": 0.86903906, + "learning_rate": 0.0005028038660940197, + "loss": 0.87966514, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2668, + "time_per_iteration": 2.5607664585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_mlp": 1.04923522, + "diversity_loss_mlp": 0.0, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.06521290367629204, + "language_loss": 0.84553415, + "learning_rate": 0.0005024923281584648, + "loss": 0.8561402, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2669, + "time_per_iteration": 2.623643159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066692, + "balance_loss_mlp": 1.05528402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.06549707374857121, + "language_loss": 0.82560658, + "learning_rate": 0.0005021807892553026, + "loss": 0.83627355, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2670, + "time_per_iteration": 2.699392318725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062757, + "balance_loss_mlp": 1.05140269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07318428846825417, + "language_loss": 0.84862608, + "learning_rate": 0.0005018692495054828, + "loss": 0.85925364, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2671, + "time_per_iteration": 2.7645046710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106912, + "balance_loss_mlp": 1.05812323, + "diversity_loss_mlp": 0.0, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06397327244364565, + "language_loss": 0.80696338, + "learning_rate": 0.0005015577090299561, + "loss": 0.81765461, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.11004639, + "routerloss_mlp": 0.0, + "step": 2672, + "time_per_iteration": 2.684048891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068328, + "balance_loss_mlp": 1.05731261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.06574977800170037, + "language_loss": 0.86744952, + "learning_rate": 0.0005012461679496729, + "loss": 0.87813282, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2673, + "time_per_iteration": 2.5885825157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.06613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.09032594792095527, + "language_loss": 0.87748468, + "learning_rate": 0.0005009346263855848, + "loss": 0.88825834, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2674, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092275, + "balance_loss_mlp": 1.08141518, + "diversity_loss_mlp": 0.0, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.06465969942237398, + "language_loss": 0.83699256, + "learning_rate": 0.0005006230844586422, + "loss": 0.84791529, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2675, + "time_per_iteration": 2.7912445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00882234, + "balance_loss_mlp": 1.52600026, + "diversity_loss_mlp": 0.21199086, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.0263651655655577, + "language_loss": 0.78895926, + "learning_rate": 0.0005003115422897968, + "loss": 0.79778159, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01323896, + "step": 2676, + "time_per_iteration": 2.8051552772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111408, + "balance_loss_mlp": 1.10282683, + "diversity_loss_mlp": 0.0, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.0741463219638638, + "language_loss": 0.87253916, + "learning_rate": 0.0005, + "loss": 0.88367999, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2677, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119404, + "balance_loss_mlp": 1.10841274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.08792863943872284, + "language_loss": 0.79283178, + "learning_rate": 0.0004996884577102033, + "loss": 0.80402583, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.10992432, + "routerloss_mlp": 0.0, + "step": 2678, + "time_per_iteration": 3.089707374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111646, + "balance_loss_mlp": 1.10545659, + "diversity_loss_mlp": 0.0, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.08112886088857633, + "language_loss": 0.84611261, + "learning_rate": 0.000499376915541358, + "loss": 0.85727721, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 2679, + "time_per_iteration": 2.7143540382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_mlp": 1.08910465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.16255458440641746, + "language_loss": 0.81113428, + "learning_rate": 0.0004990653736144155, + "loss": 0.82213122, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2680, + "time_per_iteration": 2.857952356338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_mlp": 1.07416916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06912387000686389, + "language_loss": 0.85820174, + "learning_rate": 0.0004987538320503271, + "loss": 0.86905092, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.10748291, + "routerloss_mlp": 0.0, + "step": 2681, + "time_per_iteration": 2.485462188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077984, + "balance_loss_mlp": 1.06715369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08121908376237164, + "language_loss": 0.83137929, + "learning_rate": 0.0004984422909700442, + "loss": 0.84215909, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2682, + "time_per_iteration": 2.7179505825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_mlp": 1.05784559, + "diversity_loss_mlp": 0.0, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.07829442771548371, + "language_loss": 0.83800036, + "learning_rate": 0.0004981307504945173, + "loss": 0.84868753, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2683, + "time_per_iteration": 2.71893048286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061815, + "balance_loss_mlp": 1.05075228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.08619577510477876, + "language_loss": 0.89448887, + "learning_rate": 0.0004978192107446976, + "loss": 0.90510702, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2684, + "time_per_iteration": 2.7385506629943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062179, + "balance_loss_mlp": 1.05111599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.08129158019501125, + "language_loss": 0.8740204, + "learning_rate": 0.0004975076718415353, + "loss": 0.88464212, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2685, + "time_per_iteration": 2.599379777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_mlp": 1.04478931, + "diversity_loss_mlp": 0.0, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.06772474949474022, + "language_loss": 0.90610582, + "learning_rate": 0.0004971961339059806, + "loss": 0.91666389, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.11016846, + "routerloss_mlp": 0.0, + "step": 2686, + "time_per_iteration": 2.498819589614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057473, + "balance_loss_mlp": 1.04611838, + "diversity_loss_mlp": 0.0, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06487308694775892, + "language_loss": 0.84021914, + "learning_rate": 0.0004968845970589832, + "loss": 0.85079384, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2687, + "time_per_iteration": 2.6814825534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.04982185, + "diversity_loss_mlp": 0.0, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.06911328459433905, + "language_loss": 0.8435297, + "learning_rate": 0.0004965730614214926, + "loss": 0.8541429, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2688, + "time_per_iteration": 2.6537294387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106762, + "balance_loss_mlp": 1.05618167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07039148040030412, + "language_loss": 0.85285878, + "learning_rate": 0.0004962615271144576, + "loss": 0.86353499, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2689, + "time_per_iteration": 2.50710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.05325246, + "diversity_loss_mlp": 0.0, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.0770213433091723, + "language_loss": 0.82680881, + "learning_rate": 0.0004959499942588264, + "loss": 0.83745599, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2690, + "time_per_iteration": 2.892293930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_mlp": 1.04297149, + "diversity_loss_mlp": 0.0, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03551055813206397, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79249913, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2691, + "time_per_iteration": 4.764665842056274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.05894208, + "diversity_loss_mlp": 0.0, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.08037192658361764, + "language_loss": 0.85416174, + "learning_rate": 0.0004953269333855661, + "loss": 0.86486399, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2692, + "time_per_iteration": 2.785511016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06407034, + "diversity_loss_mlp": 0.0, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.06114385406953633, + "language_loss": 0.84516799, + "learning_rate": 0.0004950154056098309, + "loss": 0.85592318, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.11437988, + "routerloss_mlp": 0.0, + "step": 2693, + "time_per_iteration": 2.683246374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.07183599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.08066804074186672, + "language_loss": 0.84078431, + "learning_rate": 0.0004947038797692867, + "loss": 0.85161769, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2694, + "time_per_iteration": 2.8312196731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872465, + "balance_loss_mlp": 1.50766385, + "diversity_loss_mlp": 0.2097543, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.031552182630998016, + "language_loss": 0.77636528, + "learning_rate": 0.0004943923559848789, + "loss": 0.78508997, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01375636, + "step": 2695, + "time_per_iteration": 2.8084189891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010865, + "balance_loss_mlp": 1.07534158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.055486891719670514, + "language_loss": 0.90695632, + "learning_rate": 0.0004940808343775515, + "loss": 0.91782129, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2696, + "time_per_iteration": 2.6868011951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874209, + "balance_loss_mlp": 1.50797677, + "diversity_loss_mlp": 0.21290711, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.034010170020107075, + "language_loss": 0.82213199, + "learning_rate": 0.0004937693150682479, + "loss": 0.83087409, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01376703, + "step": 2697, + "time_per_iteration": 2.5905513763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090314, + "balance_loss_mlp": 1.07915568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.06705206433038317, + "language_loss": 0.7658723, + "learning_rate": 0.0004934577981779107, + "loss": 0.77677542, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2698, + "time_per_iteration": 2.7049057483673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.07585335, + "diversity_loss_mlp": 0.0, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.061529133753451364, + "language_loss": 0.812904, + "learning_rate": 0.0004931462838274817, + "loss": 0.82377493, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2699, + "time_per_iteration": 2.8723175525665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089813, + "balance_loss_mlp": 1.07877994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.08487292742433496, + "language_loss": 0.84222901, + "learning_rate": 0.0004928347721379011, + "loss": 0.85312712, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2700, + "time_per_iteration": 2.639867067337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080545, + "balance_loss_mlp": 1.06974459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06134037245316137, + "language_loss": 0.82221866, + "learning_rate": 0.0004925232632301089, + "loss": 0.83302414, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2701, + "time_per_iteration": 2.622311592102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077123, + "balance_loss_mlp": 1.0660243, + "diversity_loss_mlp": 0.0, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.06337758152829237, + "language_loss": 0.79842103, + "learning_rate": 0.0004922117572250431, + "loss": 0.80919224, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 2702, + "time_per_iteration": 2.6980605125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070723, + "balance_loss_mlp": 1.05936241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.07398400160993446, + "language_loss": 0.80852163, + "learning_rate": 0.0004919002542436414, + "loss": 0.81922889, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2703, + "time_per_iteration": 2.8354647159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072853, + "balance_loss_mlp": 1.0619514, + "diversity_loss_mlp": 0.0, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.064542502306726, + "language_loss": 0.8126899, + "learning_rate": 0.0004915887544068399, + "loss": 0.8234185, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2704, + "time_per_iteration": 2.6693973541259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.05770195, + "diversity_loss_mlp": 0.0, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.06578360362401801, + "language_loss": 0.7856639, + "learning_rate": 0.0004912772578355736, + "loss": 0.79635167, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2705, + "time_per_iteration": 2.892735481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0611918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.07750798967783011, + "language_loss": 0.82549465, + "learning_rate": 0.000490965764650776, + "loss": 0.83621788, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2706, + "time_per_iteration": 2.8544106483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_mlp": 1.05984521, + "diversity_loss_mlp": 0.0, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06572065456776559, + "language_loss": 0.82828736, + "learning_rate": 0.0004906542749733798, + "loss": 0.83899713, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.11132812, + "routerloss_mlp": 0.0, + "step": 2707, + "time_per_iteration": 3.6044294834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.06353068, + "diversity_loss_mlp": 0.0, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.055629683487612144, + "language_loss": 0.85401118, + "learning_rate": 0.0004903427889243156, + "loss": 0.86475539, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.10894775, + "routerloss_mlp": 0.0, + "step": 2708, + "time_per_iteration": 2.830115795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075334, + "balance_loss_mlp": 1.06425905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.06692681375903406, + "language_loss": 0.85444081, + "learning_rate": 0.0004900313066245134, + "loss": 0.86519414, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2709, + "time_per_iteration": 2.6552441120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_mlp": 1.05745232, + "diversity_loss_mlp": 0.0, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.06855502771674758, + "language_loss": 0.81061214, + "learning_rate": 0.0004897198281949012, + "loss": 0.82129598, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2710, + "time_per_iteration": 2.645981550216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874972, + "balance_loss_mlp": 1.51124442, + "diversity_loss_mlp": 0.21021394, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.03577466895356274, + "language_loss": 0.78009295, + "learning_rate": 0.0004894083537564057, + "loss": 0.78884268, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424256, + "step": 2711, + "time_per_iteration": 2.746945858001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086804, + "balance_loss_mlp": 1.49602354, + "diversity_loss_mlp": 0.21089339, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.02967241377466632, + "language_loss": 0.80981171, + "learning_rate": 0.0004890968834299519, + "loss": 0.81849211, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01458106, + "step": 2712, + "time_per_iteration": 2.749049663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072348, + "balance_loss_mlp": 1.06096959, + "diversity_loss_mlp": 0.0, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06422523073894505, + "language_loss": 0.78739542, + "learning_rate": 0.0004887854173364633, + "loss": 0.79811883, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2713, + "time_per_iteration": 2.760077953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00862336, + "balance_loss_mlp": 1.48416615, + "diversity_loss_mlp": 0.2112534, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.02839704110509781, + "language_loss": 0.81564224, + "learning_rate": 0.0004884739555968617, + "loss": 0.8242656, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01462588, + "step": 2714, + "time_per_iteration": 2.902200698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_mlp": 1.03711605, + "diversity_loss_mlp": 0.0, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.025188943281148922, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8002032, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2715, + "time_per_iteration": 4.977273464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847492, + "balance_loss_mlp": 1.45660305, + "diversity_loss_mlp": 0.21012819, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.03573397478438407, + "language_loss": 0.86888605, + "learning_rate": 0.0004878510456629992, + "loss": 0.87736094, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01412619, + "step": 2716, + "time_per_iteration": 2.998455286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068588, + "balance_loss_mlp": 1.05767989, + "diversity_loss_mlp": 0.0, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.06765059094142209, + "language_loss": 0.85142076, + "learning_rate": 0.00048753959771057314, + "loss": 0.86210662, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.10925293, + "routerloss_mlp": 0.0, + "step": 2717, + "time_per_iteration": 2.6113662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065726, + "balance_loss_mlp": 1.05442464, + "diversity_loss_mlp": 0.0, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.08600503840688169, + "language_loss": 0.82445514, + "learning_rate": 0.0004872281545957044, + "loss": 0.83511233, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2718, + "time_per_iteration": 2.7617604732513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05911732, + "diversity_loss_mlp": 0.0, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.061040572409093316, + "language_loss": 0.86051857, + "learning_rate": 0.0004869167164393055, + "loss": 0.87122279, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.11303711, + "routerloss_mlp": 0.0, + "step": 2719, + "time_per_iteration": 2.932154417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.05857992, + "diversity_loss_mlp": 0.0, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.11614833297327579, + "language_loss": 0.89542395, + "learning_rate": 0.00048660528336228793, + "loss": 0.90612125, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.11151123, + "routerloss_mlp": 0.0, + "step": 2720, + "time_per_iteration": 2.7917380332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071611, + "balance_loss_mlp": 1.06013143, + "diversity_loss_mlp": 0.0, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05730438157509479, + "language_loss": 0.90177751, + "learning_rate": 0.0004862938554855606, + "loss": 0.91249359, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2721, + "time_per_iteration": 2.809875965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074661, + "balance_loss_mlp": 1.06371188, + "diversity_loss_mlp": 0.0, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.06740042101514945, + "language_loss": 0.86071771, + "learning_rate": 0.0004859824329300304, + "loss": 0.87146431, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2722, + "time_per_iteration": 2.5660176277160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070887, + "balance_loss_mlp": 1.05932951, + "diversity_loss_mlp": 0.0, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.06312939516717878, + "language_loss": 0.83826602, + "learning_rate": 0.00048567101581660244, + "loss": 0.84897488, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2723, + "time_per_iteration": 2.593005895614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107168, + "balance_loss_mlp": 1.0603317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.07171512526566694, + "language_loss": 0.86622667, + "learning_rate": 0.00048535960426617956, + "loss": 0.87694347, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2724, + "time_per_iteration": 2.611551523208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070313, + "balance_loss_mlp": 1.05852962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.07077799246948024, + "language_loss": 0.81735158, + "learning_rate": 0.0004850481983996621, + "loss": 0.82805473, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2725, + "time_per_iteration": 2.7656939029693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058325, + "balance_loss_mlp": 1.04673731, + "diversity_loss_mlp": 0.0, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.07497614956550303, + "language_loss": 0.87961793, + "learning_rate": 0.0004847367983379492, + "loss": 0.89020109, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2726, + "time_per_iteration": 2.523099899291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.05477571, + "diversity_loss_mlp": 0.0, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06275633211650163, + "language_loss": 0.78715622, + "learning_rate": 0.00048442540420193643, + "loss": 0.79781681, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2727, + "time_per_iteration": 2.9433038234710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056804, + "balance_loss_mlp": 1.04506755, + "diversity_loss_mlp": 0.0, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07393634521455344, + "language_loss": 0.79367208, + "learning_rate": 0.0004841140161125182, + "loss": 0.80424011, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2728, + "time_per_iteration": 3.619252920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.05171847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.07165329358033216, + "language_loss": 0.84827459, + "learning_rate": 0.0004838026341905857, + "loss": 0.85890496, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.11322021, + "routerloss_mlp": 0.0, + "step": 2729, + "time_per_iteration": 2.716114044189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057536, + "balance_loss_mlp": 1.04594862, + "diversity_loss_mlp": 0.0, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.13042739485624238, + "language_loss": 0.85312545, + "learning_rate": 0.00048349125855702844, + "loss": 0.86370087, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2730, + "time_per_iteration": 2.787280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837258, + "balance_loss_mlp": 1.43598437, + "diversity_loss_mlp": 0.21135046, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.027658523195400363, + "language_loss": 0.81318069, + "learning_rate": 0.00048317988933273287, + "loss": 0.82155323, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01359018, + "step": 2731, + "time_per_iteration": 2.763814687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057346, + "balance_loss_mlp": 1.04585993, + "diversity_loss_mlp": 0.0, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.07420390441928848, + "language_loss": 0.82373381, + "learning_rate": 0.00048286852663858367, + "loss": 0.83430725, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2732, + "time_per_iteration": 2.9533157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.05203819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.07616653501098058, + "language_loss": 0.8428973, + "learning_rate": 0.000482557170595462, + "loss": 0.8535338, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2733, + "time_per_iteration": 2.865147829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.0532366, + "diversity_loss_mlp": 0.0, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.060395165010054055, + "language_loss": 0.87880594, + "learning_rate": 0.0004822458213242475, + "loss": 0.88945693, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.11859131, + "routerloss_mlp": 0.0, + "step": 2734, + "time_per_iteration": 2.557253360748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070633, + "balance_loss_mlp": 1.05886698, + "diversity_loss_mlp": 0.0, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.1031910380133139, + "language_loss": 0.86086309, + "learning_rate": 0.00048193447894581627, + "loss": 0.8715694, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2735, + "time_per_iteration": 3.122976541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_mlp": 1.06436014, + "diversity_loss_mlp": 0.0, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06843040001694842, + "language_loss": 0.8809998, + "learning_rate": 0.00048162314358104243, + "loss": 0.89175981, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2736, + "time_per_iteration": 2.6340246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00824973, + "balance_loss_mlp": 1.41347969, + "diversity_loss_mlp": 0.20989257, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.031515925317837694, + "language_loss": 0.83306372, + "learning_rate": 0.0004813118153507969, + "loss": 0.84131336, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01328672, + "step": 2737, + "time_per_iteration": 2.7356157302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_mlp": 1.03480983, + "diversity_loss_mlp": 0.0, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03217065957479051, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83488321, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2738, + "time_per_iteration": 4.772867202758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.06062317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.0555866415390632, + "language_loss": 0.83715498, + "learning_rate": 0.00048068918077736163, + "loss": 0.84787494, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2739, + "time_per_iteration": 3.2028074264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076914, + "balance_loss_mlp": 1.06573176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06998122113459494, + "language_loss": 0.81445146, + "learning_rate": 0.0004803778746759001, + "loss": 0.82522058, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2740, + "time_per_iteration": 2.87070369720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082959, + "balance_loss_mlp": 1.07215285, + "diversity_loss_mlp": 0.0, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.07737040857299185, + "language_loss": 0.82122779, + "learning_rate": 0.00048006657619242317, + "loss": 0.83205736, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.10809326, + "routerloss_mlp": 0.0, + "step": 2741, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107519, + "balance_loss_mlp": 1.06447887, + "diversity_loss_mlp": 0.0, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07879516603511716, + "language_loss": 0.78380877, + "learning_rate": 0.00047975528544778775, + "loss": 0.79456067, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2742, + "time_per_iteration": 2.6197235584259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079206, + "balance_loss_mlp": 1.06839335, + "diversity_loss_mlp": 0.0, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07439948679259917, + "language_loss": 0.88816094, + "learning_rate": 0.00047944400256284754, + "loss": 0.89895302, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.10827637, + "routerloss_mlp": 0.0, + "step": 2743, + "time_per_iteration": 2.6887855529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00830459, + "balance_loss_mlp": 1.42072511, + "diversity_loss_mlp": 0.21262056, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.03227823662204125, + "language_loss": 0.799101, + "learning_rate": 0.0004791327276584532, + "loss": 0.80740565, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01378582, + "step": 2744, + "time_per_iteration": 2.8497848510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07629538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.0718535906247093, + "language_loss": 0.80497956, + "learning_rate": 0.00047882146085545264, + "loss": 0.81585032, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2745, + "time_per_iteration": 2.6078941822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017458, + "balance_loss_mlp": 1.01199865, + "diversity_loss_mlp": 0.0, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.013176381696238814, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76419842, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 2746, + "time_per_iteration": 4.974900007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078307, + "balance_loss_mlp": 1.06777453, + "diversity_loss_mlp": 0.0, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0894490118638191, + "language_loss": 0.79344547, + "learning_rate": 0.00047819895203700684, + "loss": 0.80422854, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2747, + "time_per_iteration": 2.717135190963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_mlp": 1.00983751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.009473538771460566, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76527709, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 2748, + "time_per_iteration": 4.642770290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085947, + "balance_loss_mlp": 1.07577801, + "diversity_loss_mlp": 0.0, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.07060951554594143, + "language_loss": 0.88469762, + "learning_rate": 0.0004775764770742277, + "loss": 0.89555711, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2749, + "time_per_iteration": 2.8018476963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087955, + "balance_loss_mlp": 1.07761312, + "diversity_loss_mlp": 0.0, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.08234082280170717, + "language_loss": 0.86406553, + "learning_rate": 0.00047726525259079777, + "loss": 0.8749451, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2750, + "time_per_iteration": 2.8415229320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00831428, + "balance_loss_mlp": 1.42309499, + "diversity_loss_mlp": 0.21321589, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.03400797212131273, + "language_loss": 0.88723552, + "learning_rate": 0.0004769540369337798, + "loss": 0.89554983, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01327293, + "step": 2751, + "time_per_iteration": 2.752032518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100532, + "balance_loss_mlp": 1.09000587, + "diversity_loss_mlp": 0.0, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06288245154731438, + "language_loss": 0.85769415, + "learning_rate": 0.00047664283022399794, + "loss": 0.86869949, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2752, + "time_per_iteration": 2.8568003177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107464, + "balance_loss_mlp": 1.09725976, + "diversity_loss_mlp": 0.0, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0883883166781065, + "language_loss": 0.80924225, + "learning_rate": 0.00047633163258227376, + "loss": 0.82031691, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2753, + "time_per_iteration": 2.8275938034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104119, + "balance_loss_mlp": 1.09359312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06733658380062774, + "language_loss": 0.85417688, + "learning_rate": 0.0004760204441294247, + "loss": 0.86521804, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2754, + "time_per_iteration": 2.6338090896606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104137, + "balance_loss_mlp": 1.09376574, + "diversity_loss_mlp": 0.0, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06936353635633287, + "language_loss": 0.85999346, + "learning_rate": 0.00047570926498626486, + "loss": 0.87103486, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 2755, + "time_per_iteration": 2.716575860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108637, + "balance_loss_mlp": 1.09822416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.061285448286525046, + "language_loss": 0.81361842, + "learning_rate": 0.00047539809527360474, + "loss": 0.82470477, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2756, + "time_per_iteration": 2.881225109100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.0919373, + "diversity_loss_mlp": 0.0, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05865021558391441, + "language_loss": 0.82642096, + "learning_rate": 0.0004750869351122511, + "loss": 0.83744538, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2757, + "time_per_iteration": 2.9978790283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096362, + "balance_loss_mlp": 1.08600891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.07787390265260127, + "language_loss": 0.81663013, + "learning_rate": 0.00047477578462300685, + "loss": 0.82759368, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2758, + "time_per_iteration": 2.700833797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090285, + "balance_loss_mlp": 1.07975245, + "diversity_loss_mlp": 0.0, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.069319292192906, + "language_loss": 0.80022508, + "learning_rate": 0.0004744646439266718, + "loss": 0.81112796, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2759, + "time_per_iteration": 3.0144033432006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084993, + "balance_loss_mlp": 1.07477677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.05678736813253772, + "language_loss": 0.92058611, + "learning_rate": 0.000474153513144041, + "loss": 0.93143606, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2760, + "time_per_iteration": 2.890305995941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082633, + "balance_loss_mlp": 1.07224369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06975892982263965, + "language_loss": 0.8659752, + "learning_rate": 0.00047384239239590633, + "loss": 0.87680155, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2761, + "time_per_iteration": 2.864649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076622, + "balance_loss_mlp": 1.06607819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06592907525694008, + "language_loss": 0.88956439, + "learning_rate": 0.0004735312818030556, + "loss": 0.90033066, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2762, + "time_per_iteration": 2.7256298065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.06967998, + "diversity_loss_mlp": 0.0, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06903030148880929, + "language_loss": 0.82737643, + "learning_rate": 0.0004732201814862727, + "loss": 0.83817625, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2763, + "time_per_iteration": 2.785104990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078579, + "balance_loss_mlp": 1.0687145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.07391416357546753, + "language_loss": 0.81619537, + "learning_rate": 0.0004729090915663373, + "loss": 0.82698119, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 2764, + "time_per_iteration": 2.841716766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841129, + "balance_loss_mlp": 1.43825924, + "diversity_loss_mlp": 0.21717778, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.03676047653681057, + "language_loss": 0.84753668, + "learning_rate": 0.00047259801216402534, + "loss": 0.85594797, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01341068, + "step": 2765, + "time_per_iteration": 2.5414865016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078812, + "balance_loss_mlp": 1.06872129, + "diversity_loss_mlp": 0.0, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.08353685320939014, + "language_loss": 0.86307138, + "learning_rate": 0.00047228694340010845, + "loss": 0.87385947, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 2766, + "time_per_iteration": 2.571230173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083419, + "balance_loss_mlp": 1.07304192, + "diversity_loss_mlp": 0.0, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07758433064211989, + "language_loss": 0.85983396, + "learning_rate": 0.0004719758853953544, + "loss": 0.87066811, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2767, + "time_per_iteration": 3.5577545166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.07479465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.08923013324738549, + "language_loss": 0.83480549, + "learning_rate": 0.00047166483827052645, + "loss": 0.84565854, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2768, + "time_per_iteration": 2.3904964923858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014357, + "balance_loss_mlp": 1.0088253, + "diversity_loss_mlp": 0.0, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.015852342000118255, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78092843, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2769, + "time_per_iteration": 4.993681907653809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100393, + "balance_loss_mlp": 1.08974218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.07499519146645399, + "language_loss": 0.8344022, + "learning_rate": 0.000471042777143682, + "loss": 0.84540612, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2770, + "time_per_iteration": 3.2187654972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099844, + "balance_loss_mlp": 1.0895741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.07177386868704265, + "language_loss": 0.79602164, + "learning_rate": 0.0004707317633831707, + "loss": 0.80702007, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2771, + "time_per_iteration": 2.5579092502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08694136, + "diversity_loss_mlp": 0.0, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.08358365289860634, + "language_loss": 0.78326285, + "learning_rate": 0.00047042076098559673, + "loss": 0.79423904, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.10687256, + "routerloss_mlp": 0.0, + "step": 2772, + "time_per_iteration": 2.6240808963775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089428, + "balance_loss_mlp": 1.07924104, + "diversity_loss_mlp": 0.0, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07827879900232339, + "language_loss": 0.7374208, + "learning_rate": 0.00047010977007170174, + "loss": 0.7483151, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2773, + "time_per_iteration": 3.239807605743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108646, + "balance_loss_mlp": 1.07606506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.0770996892807777, + "language_loss": 0.82462615, + "learning_rate": 0.00046979879076222334, + "loss": 0.83549076, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2774, + "time_per_iteration": 2.6871917247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081473, + "balance_loss_mlp": 1.07122087, + "diversity_loss_mlp": 0.0, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.060681013844514214, + "language_loss": 0.84932172, + "learning_rate": 0.0004694878231778939, + "loss": 0.86013645, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2775, + "time_per_iteration": 3.3516969680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_mlp": 1.07336903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06561156947814625, + "language_loss": 0.84353071, + "learning_rate": 0.0004691768674394423, + "loss": 0.85436922, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2776, + "time_per_iteration": 2.9356815814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010203, + "balance_loss_mlp": 1.01491189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.017317997453326725, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85504305, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2777, + "time_per_iteration": 4.766932010650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017275, + "balance_loss_mlp": 1.01186275, + "diversity_loss_mlp": 0.0, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.016201867017030143, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77670807, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 2778, + "time_per_iteration": 5.022111177444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081648, + "balance_loss_mlp": 1.07109189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.08348606714079294, + "language_loss": 0.79229748, + "learning_rate": 0.00046824407250656676, + "loss": 0.803114, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2779, + "time_per_iteration": 2.6202685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079296, + "balance_loss_mlp": 1.06859064, + "diversity_loss_mlp": 0.0, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.0812040646365834, + "language_loss": 0.83481312, + "learning_rate": 0.0004679331653588161, + "loss": 0.84560603, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2780, + "time_per_iteration": 2.6287879943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.07337165, + "diversity_loss_mlp": 0.0, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.08148878126655458, + "language_loss": 0.85570091, + "learning_rate": 0.0004676222706605147, + "loss": 0.86654037, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2781, + "time_per_iteration": 2.634186029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082358, + "balance_loss_mlp": 1.07175457, + "diversity_loss_mlp": 0.0, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08561637601090062, + "language_loss": 0.84885913, + "learning_rate": 0.0004673113885323626, + "loss": 0.85968268, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.10601807, + "routerloss_mlp": 0.0, + "step": 2782, + "time_per_iteration": 2.839108943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.07358241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.0730092425976976, + "language_loss": 0.78793383, + "learning_rate": 0.00046700051909505494, + "loss": 0.79877448, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.10479736, + "routerloss_mlp": 0.0, + "step": 2783, + "time_per_iteration": 3.1548988819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080824, + "balance_loss_mlp": 1.06943369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678731146909953, + "language_loss": 0.84066731, + "learning_rate": 0.000466689662469282, + "loss": 0.85147554, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2784, + "time_per_iteration": 2.6213507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082867, + "balance_loss_mlp": 1.07235312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06931446022689573, + "language_loss": 0.83996934, + "learning_rate": 0.00046637881877572917, + "loss": 0.85079801, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.10522461, + "routerloss_mlp": 0.0, + "step": 2785, + "time_per_iteration": 3.1161208152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_mlp": 1.07350779, + "diversity_loss_mlp": 0.0, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.05978198327100757, + "language_loss": 0.84824258, + "learning_rate": 0.0004660679881350764, + "loss": 0.85908508, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 2786, + "time_per_iteration": 2.7317774295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.0375849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.025126940202686972, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.7665174, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.06005859, + "routerloss_mlp": 0.0, + "step": 2787, + "time_per_iteration": 5.0151801109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079952, + "balance_loss_mlp": 1.06945598, + "diversity_loss_mlp": 0.0, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07181749108152896, + "language_loss": 0.78038859, + "learning_rate": 0.0004654463664951667, + "loss": 0.79118812, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2788, + "time_per_iteration": 2.9862492084503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.06444538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06160548649513732, + "language_loss": 0.83008492, + "learning_rate": 0.0004651355757372447, + "loss": 0.84083349, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2789, + "time_per_iteration": 2.6209347248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00838367, + "balance_loss_mlp": 1.43426061, + "diversity_loss_mlp": 0.2158158, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.029696530744324656, + "language_loss": 0.8589375, + "learning_rate": 0.00046482479851489274, + "loss": 0.86732113, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01332852, + "step": 2790, + "time_per_iteration": 2.6991934776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.06660962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09378702232215988, + "language_loss": 0.77937293, + "learning_rate": 0.00046451403494876525, + "loss": 0.79014528, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.10632324, + "routerloss_mlp": 0.0, + "step": 2791, + "time_per_iteration": 2.8735973834991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05943799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.07434319158841775, + "language_loss": 0.84554839, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625106, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.1083374, + "routerloss_mlp": 0.0, + "step": 2792, + "time_per_iteration": 2.7458536624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065699, + "balance_loss_mlp": 1.05472004, + "diversity_loss_mlp": 0.0, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.06545464420604186, + "language_loss": 0.85163087, + "learning_rate": 0.00046389254926777404, + "loss": 0.86228788, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.10980225, + "routerloss_mlp": 0.0, + "step": 2793, + "time_per_iteration": 2.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062925, + "balance_loss_mlp": 1.0519762, + "diversity_loss_mlp": 0.0, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.06502650627416932, + "language_loss": 0.78292251, + "learning_rate": 0.0004635818273941926, + "loss": 0.79355174, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2794, + "time_per_iteration": 3.569359302520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.04798412, + "diversity_loss_mlp": 0.0, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.0851115940139546, + "language_loss": 0.81696212, + "learning_rate": 0.0004632711196593997, + "loss": 0.82755053, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2795, + "time_per_iteration": 2.763248920440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059516, + "balance_loss_mlp": 1.04872167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08577601840657965, + "language_loss": 0.85307401, + "learning_rate": 0.00046296042618402297, + "loss": 0.86366916, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2796, + "time_per_iteration": 3.059995651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.05436158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.05816929772054262, + "language_loss": 0.79285312, + "learning_rate": 0.0004626497470886839, + "loss": 0.80350512, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2797, + "time_per_iteration": 2.9551138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.04897988, + "diversity_loss_mlp": 0.0, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06686475877008137, + "language_loss": 0.82082057, + "learning_rate": 0.00046233908249399897, + "loss": 0.83141726, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 2798, + "time_per_iteration": 2.7494163513183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_mlp": 1.06012726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.06311972638358435, + "language_loss": 0.78919041, + "learning_rate": 0.00046202843252057905, + "loss": 0.79990107, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.10943604, + "routerloss_mlp": 0.0, + "step": 2799, + "time_per_iteration": 2.586824655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.06545627, + "diversity_loss_mlp": 0.0, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06763496495115903, + "language_loss": 0.83705521, + "learning_rate": 0.00046171779728902896, + "loss": 0.84781897, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2800, + "time_per_iteration": 2.5922951698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.07354665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.12725923305511472, + "language_loss": 0.86135888, + "learning_rate": 0.000461407176919948, + "loss": 0.87220615, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2801, + "time_per_iteration": 2.532080888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.07459974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08372818850883645, + "language_loss": 0.85317719, + "learning_rate": 0.00046109657153392997, + "loss": 0.8640309, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2802, + "time_per_iteration": 2.7498726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.07185912, + "diversity_loss_mlp": 0.0, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07972844989907181, + "language_loss": 0.82981819, + "learning_rate": 0.0004607859812515622, + "loss": 0.84064734, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2803, + "time_per_iteration": 2.5823397636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077838, + "balance_loss_mlp": 1.06679916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06982591680837838, + "language_loss": 0.88185596, + "learning_rate": 0.00046047540619342667, + "loss": 0.89263427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2804, + "time_per_iteration": 2.582594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089122, + "balance_loss_mlp": 1.07845902, + "diversity_loss_mlp": 0.0, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.06923180186476277, + "language_loss": 0.80359995, + "learning_rate": 0.00046016484648009933, + "loss": 0.81449121, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.10675049, + "routerloss_mlp": 0.0, + "step": 2805, + "time_per_iteration": 2.705085277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082055, + "balance_loss_mlp": 1.0713259, + "diversity_loss_mlp": 0.0, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.06938884531628577, + "language_loss": 0.81049907, + "learning_rate": 0.0004598543022321501, + "loss": 0.82131958, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.10736084, + "routerloss_mlp": 0.0, + "step": 2806, + "time_per_iteration": 2.6722495555877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855076, + "balance_loss_mlp": 1.46593428, + "diversity_loss_mlp": 0.21781196, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.030466031644405155, + "language_loss": 0.79783833, + "learning_rate": 0.0004595437735701433, + "loss": 0.80638903, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01320273, + "step": 2807, + "time_per_iteration": 2.734110116958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.07728648, + "diversity_loss_mlp": 0.0, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08474622827734493, + "language_loss": 0.83849192, + "learning_rate": 0.00045923326061463623, + "loss": 0.84937334, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2808, + "time_per_iteration": 2.7606189250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089545, + "balance_loss_mlp": 1.07878006, + "diversity_loss_mlp": 0.0, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06442619071995537, + "language_loss": 0.8173002, + "learning_rate": 0.00045892276348618113, + "loss": 0.82819563, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2809, + "time_per_iteration": 2.9691591262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_mlp": 1.02887774, + "diversity_loss_mlp": 0.0, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.01908051648382603, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294789, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2810, + "time_per_iteration": 4.957923173904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089256, + "balance_loss_mlp": 1.07848597, + "diversity_loss_mlp": 0.0, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.05960464217413758, + "language_loss": 0.80596066, + "learning_rate": 0.000458301817192603, + "loss": 0.81685317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2811, + "time_per_iteration": 2.852247714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021724, + "balance_loss_mlp": 1.0165503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.015447521326512613, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81863511, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.05175781, + "routerloss_mlp": 0.0, + "step": 2812, + "time_per_iteration": 4.808724880218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080487, + "balance_loss_mlp": 1.06993747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.06805695837678187, + "language_loss": 0.87130654, + "learning_rate": 0.00045768093565369983, + "loss": 0.88211143, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 2813, + "time_per_iteration": 2.7794101238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.08034182, + "diversity_loss_mlp": 0.0, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.06578755075233327, + "language_loss": 0.8208549, + "learning_rate": 0.0004573705194685646, + "loss": 0.83176434, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2814, + "time_per_iteration": 2.686871290206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084437, + "balance_loss_mlp": 1.07364845, + "diversity_loss_mlp": 0.0, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07321549809116977, + "language_loss": 0.84966654, + "learning_rate": 0.00045706011983366157, + "loss": 0.86051095, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2815, + "time_per_iteration": 2.676772117614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843207, + "balance_loss_mlp": 1.44560027, + "diversity_loss_mlp": 0.21445701, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.03775972378408833, + "language_loss": 0.82685602, + "learning_rate": 0.00045674973686949847, + "loss": 0.83528805, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01317827, + "step": 2816, + "time_per_iteration": 2.548164129257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079521, + "balance_loss_mlp": 1.06887531, + "diversity_loss_mlp": 0.0, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06715248152064907, + "language_loss": 0.85478067, + "learning_rate": 0.0004564393706965766, + "loss": 0.86557591, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2817, + "time_per_iteration": 2.9715416431427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078759, + "balance_loss_mlp": 1.06789875, + "diversity_loss_mlp": 0.0, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.07300594242261846, + "language_loss": 0.81410033, + "learning_rate": 0.00045612902143539116, + "loss": 0.82488787, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2818, + "time_per_iteration": 2.5861568450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.05926371, + "diversity_loss_mlp": 0.0, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.07796543703625758, + "language_loss": 0.8169418, + "learning_rate": 0.00045581868920642986, + "loss": 0.82763875, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2819, + "time_per_iteration": 2.495675563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079235, + "balance_loss_mlp": 1.06864905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.08284985931126, + "language_loss": 0.79605496, + "learning_rate": 0.00045550837413017457, + "loss": 0.80684733, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2820, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_mlp": 1.07137275, + "diversity_loss_mlp": 0.0, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06853869944040722, + "language_loss": 0.85501075, + "learning_rate": 0.0004551980763271005, + "loss": 0.86582589, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2821, + "time_per_iteration": 2.6689629554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080137, + "balance_loss_mlp": 1.06970072, + "diversity_loss_mlp": 0.0, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.07047505467714002, + "language_loss": 0.83788973, + "learning_rate": 0.0004548877959176756, + "loss": 0.84869111, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2822, + "time_per_iteration": 2.8898305892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.06903815, + "diversity_loss_mlp": 0.0, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06782192405371351, + "language_loss": 0.86297488, + "learning_rate": 0.00045457753302236166, + "loss": 0.87376869, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2823, + "time_per_iteration": 2.626262903213501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.07755554, + "diversity_loss_mlp": 0.0, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07336203540826484, + "language_loss": 0.87131381, + "learning_rate": 0.00045426728776161353, + "loss": 0.88219345, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2824, + "time_per_iteration": 2.7630255222320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085716, + "balance_loss_mlp": 1.07529116, + "diversity_loss_mlp": 0.0, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.07766893457840997, + "language_loss": 0.81382459, + "learning_rate": 0.00045395706025587863, + "loss": 0.82468176, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2825, + "time_per_iteration": 2.653036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.05976105, + "diversity_loss_mlp": 0.0, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.08392292239142347, + "language_loss": 0.82965428, + "learning_rate": 0.00045364685062559843, + "loss": 0.84035897, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2826, + "time_per_iteration": 2.8091156482696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075397, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06510139608888613, + "language_loss": 0.91622829, + "learning_rate": 0.0004533366589912067, + "loss": 0.92698228, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2827, + "time_per_iteration": 2.949005365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075847, + "balance_loss_mlp": 1.06538677, + "diversity_loss_mlp": 0.0, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.07049343673366977, + "language_loss": 0.77641904, + "learning_rate": 0.0004530264854731306, + "loss": 0.78717756, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2828, + "time_per_iteration": 3.054252862930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079568, + "balance_loss_mlp": 1.06920242, + "diversity_loss_mlp": 0.0, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05986165572949975, + "language_loss": 0.84122354, + "learning_rate": 0.00045271633019179034, + "loss": 0.85201919, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2829, + "time_per_iteration": 2.788818836212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.06762242, + "diversity_loss_mlp": 0.0, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05963281032217842, + "language_loss": 0.87701666, + "learning_rate": 0.0004524061932675986, + "loss": 0.88779569, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2830, + "time_per_iteration": 2.861154079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073509, + "balance_loss_mlp": 1.06306028, + "diversity_loss_mlp": 0.0, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.11132414831600651, + "language_loss": 0.87095535, + "learning_rate": 0.00045209607482096125, + "loss": 0.88169038, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2831, + "time_per_iteration": 3.041248321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107632, + "balance_loss_mlp": 1.06573415, + "diversity_loss_mlp": 0.0, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.07049073021000962, + "language_loss": 0.84385192, + "learning_rate": 0.0004517859749722772, + "loss": 0.85461509, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2832, + "time_per_iteration": 2.663478374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075242, + "balance_loss_mlp": 1.0643816, + "diversity_loss_mlp": 0.0, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.06386820666055518, + "language_loss": 0.79316235, + "learning_rate": 0.0004514758938419376, + "loss": 0.80391467, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.10870361, + "routerloss_mlp": 0.0, + "step": 2833, + "time_per_iteration": 2.8141582012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_mlp": 1.03721869, + "diversity_loss_mlp": 0.0, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.027736452139364785, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77963334, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2834, + "time_per_iteration": 4.960749864578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075071, + "balance_loss_mlp": 1.06446719, + "diversity_loss_mlp": 0.0, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.06436328535255592, + "language_loss": 0.83993077, + "learning_rate": 0.00045085578821782175, + "loss": 0.85068148, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2835, + "time_per_iteration": 2.6025185585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020548, + "balance_loss_mlp": 1.01516008, + "diversity_loss_mlp": 0.0, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.015651807900939278, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155292, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2836, + "time_per_iteration": 4.911514043807983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079494, + "balance_loss_mlp": 1.06864595, + "diversity_loss_mlp": 0.0, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.05502946705999508, + "language_loss": 0.81078947, + "learning_rate": 0.00045023575891159866, + "loss": 0.82158434, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2837, + "time_per_iteration": 2.7158284187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_mlp": 1.00321293, + "diversity_loss_mlp": 0.0, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.010060791837063862, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75772309, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2838, + "time_per_iteration": 4.9448912143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078413, + "balance_loss_mlp": 1.06803036, + "diversity_loss_mlp": 0.0, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.059936217606746015, + "language_loss": 0.78111225, + "learning_rate": 0.0004496158068861354, + "loss": 0.79189646, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2839, + "time_per_iteration": 2.8019115924835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.07090366, + "diversity_loss_mlp": 0.0, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06804602152838367, + "language_loss": 0.80713242, + "learning_rate": 0.00044930586015455207, + "loss": 0.81794775, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.10638428, + "routerloss_mlp": 0.0, + "step": 2840, + "time_per_iteration": 2.771359443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076777, + "balance_loss_mlp": 1.06646562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.0578733121218936, + "language_loss": 0.88904727, + "learning_rate": 0.000448995933104179, + "loss": 0.89981508, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2841, + "time_per_iteration": 2.8486392498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.07075977, + "diversity_loss_mlp": 0.0, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.07392730491467848, + "language_loss": 0.80162299, + "learning_rate": 0.00044868602585534077, + "loss": 0.81243765, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2842, + "time_per_iteration": 2.8463480472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074348, + "balance_loss_mlp": 1.06379187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.0858024928700591, + "language_loss": 0.89360344, + "learning_rate": 0.0004483761385283541, + "loss": 0.90434694, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2843, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870358, + "balance_loss_mlp": 1.4994092, + "diversity_loss_mlp": 0.21570696, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.030684440159293704, + "language_loss": 0.8165319, + "learning_rate": 0.0004480662712435281, + "loss": 0.82523549, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01280049, + "step": 2844, + "time_per_iteration": 2.7523300647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081741, + "balance_loss_mlp": 1.07085109, + "diversity_loss_mlp": 0.0, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.08261462073704483, + "language_loss": 0.88389564, + "learning_rate": 0.0004477564241211635, + "loss": 0.89471304, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2845, + "time_per_iteration": 2.5676896572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068187, + "balance_loss_mlp": 1.0573566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07762403474355188, + "language_loss": 0.868963, + "learning_rate": 0.0004474465972815541, + "loss": 0.87964487, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2846, + "time_per_iteration": 2.4843738079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073996, + "balance_loss_mlp": 1.06337464, + "diversity_loss_mlp": 0.0, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05857404260801407, + "language_loss": 0.87612844, + "learning_rate": 0.000447136790844985, + "loss": 0.88686836, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.10626221, + "routerloss_mlp": 0.0, + "step": 2847, + "time_per_iteration": 2.659214973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068933, + "balance_loss_mlp": 1.05774474, + "diversity_loss_mlp": 0.0, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.0657788254057266, + "language_loss": 0.80922693, + "learning_rate": 0.00044682700493173385, + "loss": 0.81991625, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2848, + "time_per_iteration": 2.8093039989471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.06077814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06921376228249611, + "language_loss": 0.80399549, + "learning_rate": 0.00044651723966207004, + "loss": 0.81471407, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2849, + "time_per_iteration": 3.1084961891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.05826974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.06382752106805908, + "language_loss": 0.78137773, + "learning_rate": 0.00044620749515625536, + "loss": 0.79206896, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2850, + "time_per_iteration": 2.8127682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.05505395, + "diversity_loss_mlp": 0.0, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.07084116902380141, + "language_loss": 0.85142213, + "learning_rate": 0.00044589777153454334, + "loss": 0.86208153, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2851, + "time_per_iteration": 2.7690277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_mlp": 1.05239749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.06308922523972363, + "language_loss": 0.83850712, + "learning_rate": 0.00044558806891717895, + "loss": 0.84914547, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2852, + "time_per_iteration": 2.542076587677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066106, + "balance_loss_mlp": 1.05529404, + "diversity_loss_mlp": 0.0, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06000502851088379, + "language_loss": 0.79783493, + "learning_rate": 0.0004452783874243998, + "loss": 0.808496, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.1081543, + "routerloss_mlp": 0.0, + "step": 2853, + "time_per_iteration": 2.8680150508880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070657, + "balance_loss_mlp": 1.06022012, + "diversity_loss_mlp": 0.0, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.07387916596955035, + "language_loss": 0.84572864, + "learning_rate": 0.00044496872717643475, + "loss": 0.85643518, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2854, + "time_per_iteration": 2.676128625869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_mlp": 1.04261672, + "diversity_loss_mlp": 0.0, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.03710413532206065, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78137678, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2855, + "time_per_iteration": 4.937518835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.06609333, + "diversity_loss_mlp": 0.0, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.06582649113696544, + "language_loss": 0.81989098, + "learning_rate": 0.0004443494708958217, + "loss": 0.83065504, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2856, + "time_per_iteration": 2.9764318466186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.06707263, + "diversity_loss_mlp": 0.0, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.05962775351044122, + "language_loss": 0.80705082, + "learning_rate": 0.0004440398751035906, + "loss": 0.81782728, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2857, + "time_per_iteration": 2.8708760738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107925, + "balance_loss_mlp": 1.06846118, + "diversity_loss_mlp": 0.0, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.08652259855452149, + "language_loss": 0.83723986, + "learning_rate": 0.00044373030103700645, + "loss": 0.84803236, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2858, + "time_per_iteration": 2.629887342453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857386, + "balance_loss_mlp": 1.47058845, + "diversity_loss_mlp": 0.21831456, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.03034959963101528, + "language_loss": 0.79655832, + "learning_rate": 0.000443420748816257, + "loss": 0.80513215, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293462, + "step": 2859, + "time_per_iteration": 2.8473408222198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107821, + "balance_loss_mlp": 1.06795764, + "diversity_loss_mlp": 0.0, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.07076083110298415, + "language_loss": 0.78692329, + "learning_rate": 0.0004431112185615208, + "loss": 0.79770535, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2860, + "time_per_iteration": 2.751131534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.0721283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.06396450124437818, + "language_loss": 0.7993266, + "learning_rate": 0.00044280171039296845, + "loss": 0.81015229, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2861, + "time_per_iteration": 2.606870651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082002, + "balance_loss_mlp": 1.0716126, + "diversity_loss_mlp": 0.0, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.0734058146638898, + "language_loss": 0.8832019, + "learning_rate": 0.0004424922244307616, + "loss": 0.89402187, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2862, + "time_per_iteration": 2.728055477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081346, + "balance_loss_mlp": 1.07124305, + "diversity_loss_mlp": 0.0, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.08810368166009505, + "language_loss": 0.82030249, + "learning_rate": 0.00044218276079505315, + "loss": 0.83111596, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2863, + "time_per_iteration": 2.8925743103027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076982, + "balance_loss_mlp": 1.0667721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.06918705117949257, + "language_loss": 0.74817479, + "learning_rate": 0.0004418733196059876, + "loss": 0.75894463, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2864, + "time_per_iteration": 2.747131109237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068041, + "balance_loss_mlp": 1.0579797, + "diversity_loss_mlp": 0.0, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.060188467246496694, + "language_loss": 0.79747194, + "learning_rate": 0.0004415639009837008, + "loss": 0.80815232, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 2865, + "time_per_iteration": 2.838609218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077074, + "balance_loss_mlp": 1.06704867, + "diversity_loss_mlp": 0.0, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.06869441498871262, + "language_loss": 0.82126647, + "learning_rate": 0.00044125450504831955, + "loss": 0.83203721, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2866, + "time_per_iteration": 2.7267115116119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.07046294, + "diversity_loss_mlp": 0.0, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.0812577822304444, + "language_loss": 0.82503623, + "learning_rate": 0.0004409451319199622, + "loss": 0.83584309, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2867, + "time_per_iteration": 2.6727194786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.07005203, + "diversity_loss_mlp": 0.0, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07302760882162292, + "language_loss": 0.84415638, + "learning_rate": 0.0004406357817187381, + "loss": 0.8549571, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2868, + "time_per_iteration": 2.9669716358184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.07424247, + "diversity_loss_mlp": 0.0, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.06120403113840053, + "language_loss": 0.81250817, + "learning_rate": 0.0004403264545647474, + "loss": 0.82335043, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 2869, + "time_per_iteration": 3.535280704498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092656, + "balance_loss_mlp": 1.08244562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.05305368525165607, + "language_loss": 0.84751379, + "learning_rate": 0.00044001715057808154, + "loss": 0.85844034, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2870, + "time_per_iteration": 2.757197618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867753, + "balance_loss_mlp": 1.49414647, + "diversity_loss_mlp": 0.21602358, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.02933333976418528, + "language_loss": 0.81627762, + "learning_rate": 0.0004397078698788232, + "loss": 0.82495517, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01266836, + "step": 2871, + "time_per_iteration": 3.241936445236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_mlp": 1.04097104, + "diversity_loss_mlp": 0.0, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.0256992480173019, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81488657, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2872, + "time_per_iteration": 4.879035234451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_mlp": 1.09304726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.06889966135830194, + "language_loss": 0.78025937, + "learning_rate": 0.00043908937882281343, + "loss": 0.79129106, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.10119629, + "routerloss_mlp": 0.0, + "step": 2873, + "time_per_iteration": 2.624072313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097291, + "balance_loss_mlp": 1.08644319, + "diversity_loss_mlp": 0.0, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.06659644406743612, + "language_loss": 0.82492054, + "learning_rate": 0.0004387801687061814, + "loss": 0.83589351, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2874, + "time_per_iteration": 2.839524269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_mlp": 1.09040689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.06411004123803754, + "language_loss": 0.80204833, + "learning_rate": 0.0004384709823571958, + "loss": 0.81305587, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2875, + "time_per_iteration": 2.768268346786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092947, + "balance_loss_mlp": 1.08278441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0827933156096061, + "language_loss": 0.83099473, + "learning_rate": 0.0004381618198958932, + "loss": 0.84192419, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 2876, + "time_per_iteration": 3.509364604949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084381, + "balance_loss_mlp": 1.07393849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0672046455921574, + "language_loss": 0.83616996, + "learning_rate": 0.00043785268144230137, + "loss": 0.84701377, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2877, + "time_per_iteration": 2.8941080570220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078858, + "balance_loss_mlp": 1.06849325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.08466064144544548, + "language_loss": 0.82657743, + "learning_rate": 0.00043754356711643837, + "loss": 0.83736604, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2878, + "time_per_iteration": 2.6849513053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.0620904, + "diversity_loss_mlp": 0.0, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.08115939494621484, + "language_loss": 0.84283209, + "learning_rate": 0.0004372344770383132, + "loss": 0.85355723, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2879, + "time_per_iteration": 2.809833526611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.05426884, + "diversity_loss_mlp": 0.0, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.15468249092113104, + "language_loss": 0.82951438, + "learning_rate": 0.00043692541132792507, + "loss": 0.84015906, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2880, + "time_per_iteration": 2.6886332035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.05541205, + "diversity_loss_mlp": 0.0, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07258014540865806, + "language_loss": 0.83396262, + "learning_rate": 0.00043661637010526384, + "loss": 0.84461993, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2881, + "time_per_iteration": 2.484912872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010637, + "balance_loss_mlp": 1.05335283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.07022154553173111, + "language_loss": 0.83217472, + "learning_rate": 0.00043630735349031025, + "loss": 0.8428117, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2882, + "time_per_iteration": 2.627950429916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.05427396, + "diversity_loss_mlp": 0.0, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.05734398116556458, + "language_loss": 0.81837022, + "learning_rate": 0.00043599836160303495, + "loss": 0.8290168, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2883, + "time_per_iteration": 2.87358021736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_mlp": 1.05094647, + "diversity_loss_mlp": 0.0, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.05952583825506871, + "language_loss": 0.77472365, + "learning_rate": 0.0004356893945633995, + "loss": 0.78534073, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2884, + "time_per_iteration": 2.9415786266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058414, + "balance_loss_mlp": 1.04738104, + "diversity_loss_mlp": 0.0, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06387157363580499, + "language_loss": 0.81997669, + "learning_rate": 0.0004353804524913551, + "loss": 0.8305608, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2885, + "time_per_iteration": 2.5772132873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106399, + "balance_loss_mlp": 1.05298674, + "diversity_loss_mlp": 0.0, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07314612024272811, + "language_loss": 0.82015049, + "learning_rate": 0.0004350715355068441, + "loss": 0.8307904, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.11010742, + "routerloss_mlp": 0.0, + "step": 2886, + "time_per_iteration": 2.7211849689483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062969, + "balance_loss_mlp": 1.05221653, + "diversity_loss_mlp": 0.0, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.08671001380075964, + "language_loss": 0.79774809, + "learning_rate": 0.00043476264372979847, + "loss": 0.8083778, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.10754395, + "routerloss_mlp": 0.0, + "step": 2887, + "time_per_iteration": 2.5452206134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.05403173, + "diversity_loss_mlp": 0.0, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.08125450311694367, + "language_loss": 0.78590369, + "learning_rate": 0.0004344537772801408, + "loss": 0.79654968, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.10577393, + "routerloss_mlp": 0.0, + "step": 2888, + "time_per_iteration": 3.870267391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_mlp": 1.02839172, + "diversity_loss_mlp": 0.0, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.026917818165577125, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74456155, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2889, + "time_per_iteration": 4.943026065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091351, + "balance_loss_mlp": 1.08043766, + "diversity_loss_mlp": 0.0, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07456412824125162, + "language_loss": 0.83536172, + "learning_rate": 0.0004338361208426298, + "loss": 0.84627521, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2890, + "time_per_iteration": 2.65266752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094404, + "balance_loss_mlp": 1.08348465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.057576040721241756, + "language_loss": 0.81499392, + "learning_rate": 0.00043352733109457164, + "loss": 0.82593793, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2891, + "time_per_iteration": 2.927246332168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106638, + "balance_loss_mlp": 1.09556401, + "diversity_loss_mlp": 0.0, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.0763949134442708, + "language_loss": 0.84462321, + "learning_rate": 0.00043321856715349244, + "loss": 0.85568959, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2892, + "time_per_iteration": 2.970857858657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_mlp": 1.0918721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07453927070697552, + "language_loss": 0.80594504, + "learning_rate": 0.00043290982913926466, + "loss": 0.81697285, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.10913086, + "routerloss_mlp": 0.0, + "step": 2893, + "time_per_iteration": 2.8581972122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105658, + "balance_loss_mlp": 1.09473801, + "diversity_loss_mlp": 0.0, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.08476057735977802, + "language_loss": 0.84177083, + "learning_rate": 0.0004326011171717514, + "loss": 0.85282743, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2894, + "time_per_iteration": 2.90563702583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094642, + "balance_loss_mlp": 1.08371019, + "diversity_loss_mlp": 0.0, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06785531665857511, + "language_loss": 0.80468631, + "learning_rate": 0.0004322924313708051, + "loss": 0.8156327, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2895, + "time_per_iteration": 2.51784610748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092855, + "balance_loss_mlp": 1.08219218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.07706946900287333, + "language_loss": 0.84533763, + "learning_rate": 0.0004319837718562681, + "loss": 0.85626626, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.10668945, + "routerloss_mlp": 0.0, + "step": 2896, + "time_per_iteration": 2.5862512588500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083747, + "balance_loss_mlp": 1.07321525, + "diversity_loss_mlp": 0.0, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.0793708179068888, + "language_loss": 0.83050567, + "learning_rate": 0.0004316751387479726, + "loss": 0.84134316, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2897, + "time_per_iteration": 2.778136730194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857516, + "balance_loss_mlp": 1.47219694, + "diversity_loss_mlp": 0.21748725, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.034004819690404205, + "language_loss": 0.82499564, + "learning_rate": 0.0004313665321657409, + "loss": 0.83357084, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267361, + "step": 2898, + "time_per_iteration": 3.7754030227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06795418, + "diversity_loss_mlp": 0.0, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.08236969633510602, + "language_loss": 0.79824448, + "learning_rate": 0.00043105795222938436, + "loss": 0.80903113, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2899, + "time_per_iteration": 2.7090694904327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073397, + "balance_loss_mlp": 1.06296027, + "diversity_loss_mlp": 0.0, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07659548301877016, + "language_loss": 0.78690445, + "learning_rate": 0.00043074939905870467, + "loss": 0.79763848, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2900, + "time_per_iteration": 2.6444900035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.08372730008806528, + "language_loss": 0.80284113, + "learning_rate": 0.0004304408727734927, + "loss": 0.81353253, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 2901, + "time_per_iteration": 2.6800661087036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855039, + "balance_loss_mlp": 1.46478724, + "diversity_loss_mlp": 0.21833366, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.026106559121528438, + "language_loss": 0.88945115, + "learning_rate": 0.0004301323734935288, + "loss": 0.89800155, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01347797, + "step": 2902, + "time_per_iteration": 2.6880388259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05446076, + "diversity_loss_mlp": 0.0, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.08715674624995783, + "language_loss": 0.87386537, + "learning_rate": 0.000429823901338583, + "loss": 0.88451326, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2903, + "time_per_iteration": 2.611330032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.06004524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.07350666628476007, + "language_loss": 0.86772639, + "learning_rate": 0.00042951545642841513, + "loss": 0.87843215, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2904, + "time_per_iteration": 3.066653251647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06802535, + "diversity_loss_mlp": 0.0, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06907930895976065, + "language_loss": 0.86694556, + "learning_rate": 0.0004292070388827737, + "loss": 0.87773216, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.10644531, + "routerloss_mlp": 0.0, + "step": 2905, + "time_per_iteration": 2.5430614948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068326, + "balance_loss_mlp": 1.05785918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06877653703862108, + "language_loss": 0.81346464, + "learning_rate": 0.00042889864882139753, + "loss": 0.82414794, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2906, + "time_per_iteration": 2.5722434520721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075755, + "balance_loss_mlp": 1.06534863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06732553967994827, + "language_loss": 0.81503737, + "learning_rate": 0.0004285902863640139, + "loss": 0.82579494, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 2907, + "time_per_iteration": 2.643721580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_mlp": 1.06431222, + "diversity_loss_mlp": 0.0, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.06943407338412115, + "language_loss": 0.86278725, + "learning_rate": 0.00042828195163033966, + "loss": 0.87353367, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2908, + "time_per_iteration": 2.7045791149139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081822, + "balance_loss_mlp": 1.07135582, + "diversity_loss_mlp": 0.0, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07324820072157985, + "language_loss": 0.79102659, + "learning_rate": 0.0004279736447400812, + "loss": 0.80184484, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2909, + "time_per_iteration": 2.585176944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107588, + "balance_loss_mlp": 1.06558049, + "diversity_loss_mlp": 0.0, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.07142642262643135, + "language_loss": 0.78468478, + "learning_rate": 0.00042766536581293385, + "loss": 0.79544365, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2910, + "time_per_iteration": 2.723602771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07975566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.0702995437532307, + "language_loss": 0.79552364, + "learning_rate": 0.0004273571149685819, + "loss": 0.80642736, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2911, + "time_per_iteration": 2.7220258712768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091791, + "balance_loss_mlp": 1.08147311, + "diversity_loss_mlp": 0.0, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.06270923487878967, + "language_loss": 0.84021366, + "learning_rate": 0.00042704889232669937, + "loss": 0.85113156, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2912, + "time_per_iteration": 2.6799380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848913, + "balance_loss_mlp": 1.45588994, + "diversity_loss_mlp": 0.21708892, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.03254511626684893, + "language_loss": 0.85648382, + "learning_rate": 0.0004267406980069484, + "loss": 0.86497295, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01242387, + "step": 2913, + "time_per_iteration": 2.7309391498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.10193157, + "diversity_loss_mlp": 0.0, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.05402445789476675, + "language_loss": 0.79744071, + "learning_rate": 0.0004264325321289808, + "loss": 0.80856508, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2914, + "time_per_iteration": 2.8245773315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_mlp": 1.09404707, + "diversity_loss_mlp": 0.0, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.07588418732744176, + "language_loss": 0.86308336, + "learning_rate": 0.00042612439481243736, + "loss": 0.87412667, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2915, + "time_per_iteration": 2.7910971641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_mlp": 1.09916496, + "diversity_loss_mlp": 0.0, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.07165476469353879, + "language_loss": 0.90284097, + "learning_rate": 0.00042581628617694735, + "loss": 0.91393661, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2916, + "time_per_iteration": 2.7449898719787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00839442, + "balance_loss_mlp": 1.43753612, + "diversity_loss_mlp": 0.21687999, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.03331291255724556, + "language_loss": 0.81856477, + "learning_rate": 0.0004255082063421296, + "loss": 0.82695925, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01223436, + "step": 2917, + "time_per_iteration": 2.705263614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.12130046, + "diversity_loss_mlp": 0.0, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.07697799391889214, + "language_loss": 0.84842837, + "learning_rate": 0.00042520015542759065, + "loss": 0.85974395, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2918, + "time_per_iteration": 2.8643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_mlp": 1.09857666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.059259650717302215, + "language_loss": 0.88182557, + "learning_rate": 0.00042489213355292687, + "loss": 0.89291489, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2919, + "time_per_iteration": 2.871605634689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113923, + "balance_loss_mlp": 1.1035037, + "diversity_loss_mlp": 0.0, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.07025137955977834, + "language_loss": 0.81129396, + "learning_rate": 0.00042458414083772276, + "loss": 0.82243323, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2920, + "time_per_iteration": 2.5280137062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110383, + "balance_loss_mlp": 1.09353638, + "diversity_loss_mlp": 0.0, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.06291310679725345, + "language_loss": 0.85259616, + "learning_rate": 0.000424276177401552, + "loss": 0.86363447, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 2921, + "time_per_iteration": 2.8061861991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091107, + "balance_loss_mlp": 1.08052063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06947728514830868, + "language_loss": 0.8586399, + "learning_rate": 0.0004239682433639763, + "loss": 0.86955094, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2922, + "time_per_iteration": 2.7068192958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.07726383, + "diversity_loss_mlp": 0.0, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.06724553342566655, + "language_loss": 0.85617495, + "learning_rate": 0.0004236603388445467, + "loss": 0.86705184, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 2923, + "time_per_iteration": 2.5658164024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_mlp": 1.07329023, + "diversity_loss_mlp": 0.0, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.06491959150956746, + "language_loss": 0.82087809, + "learning_rate": 0.00042335246396280166, + "loss": 0.83171237, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 2924, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076248, + "balance_loss_mlp": 1.06606197, + "diversity_loss_mlp": 0.0, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06924351044147684, + "language_loss": 0.90442908, + "learning_rate": 0.0004230446188382693, + "loss": 0.91519153, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2925, + "time_per_iteration": 2.5210559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072823, + "balance_loss_mlp": 1.06237423, + "diversity_loss_mlp": 0.0, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06189914516088338, + "language_loss": 0.80191588, + "learning_rate": 0.0004227368035904654, + "loss": 0.81264406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2926, + "time_per_iteration": 2.957545757293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073658, + "balance_loss_mlp": 1.06312013, + "diversity_loss_mlp": 0.0, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.07119677802103677, + "language_loss": 0.8312782, + "learning_rate": 0.00042242901833889474, + "loss": 0.84201479, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 2927, + "time_per_iteration": 2.6197497844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069584, + "balance_loss_mlp": 1.05933261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.07548469953325632, + "language_loss": 0.85944557, + "learning_rate": 0.0004221212632030501, + "loss": 0.87014145, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2928, + "time_per_iteration": 3.0718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074032, + "balance_loss_mlp": 1.0636375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.0702405954135719, + "language_loss": 0.8005904, + "learning_rate": 0.0004218135383024124, + "loss": 0.81133074, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2929, + "time_per_iteration": 2.6883885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068804, + "balance_loss_mlp": 1.05836129, + "diversity_loss_mlp": 0.0, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.07423933793606223, + "language_loss": 0.85405028, + "learning_rate": 0.0004215058437564511, + "loss": 0.86473835, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2930, + "time_per_iteration": 2.5645458698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.06520677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.07045402067927274, + "language_loss": 0.82365847, + "learning_rate": 0.00042119817968462397, + "loss": 0.83441579, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2931, + "time_per_iteration": 2.596431255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843243, + "balance_loss_mlp": 1.44432163, + "diversity_loss_mlp": 0.21611315, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.034099962370994746, + "language_loss": 0.87154222, + "learning_rate": 0.0004208905462063766, + "loss": 0.8799746, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01302544, + "step": 2932, + "time_per_iteration": 2.7103724479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088146, + "balance_loss_mlp": 1.07760167, + "diversity_loss_mlp": 0.0, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07257480225633914, + "language_loss": 0.84035242, + "learning_rate": 0.00042058294344114315, + "loss": 0.8512339, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2933, + "time_per_iteration": 2.6817541122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846618, + "balance_loss_mlp": 1.45035362, + "diversity_loss_mlp": 0.21710092, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.03239193802507573, + "language_loss": 0.77597153, + "learning_rate": 0.0004202753715083456, + "loss": 0.78443778, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289086, + "step": 2934, + "time_per_iteration": 3.1172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.08684492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.08960488369203884, + "language_loss": 0.8126961, + "learning_rate": 0.0004199678305273936, + "loss": 0.82367325, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2935, + "time_per_iteration": 2.648293972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096103, + "balance_loss_mlp": 1.08564794, + "diversity_loss_mlp": 0.0, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06584718006017456, + "language_loss": 0.81395173, + "learning_rate": 0.0004196603206176854, + "loss": 0.82491279, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2936, + "time_per_iteration": 2.9504921436309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_mlp": 1.09094691, + "diversity_loss_mlp": 0.0, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.06854637503151859, + "language_loss": 0.83705592, + "learning_rate": 0.000419352841898607, + "loss": 0.84806919, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2937, + "time_per_iteration": 2.965176582336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_mlp": 1.09003913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.06908295336200668, + "language_loss": 0.77684075, + "learning_rate": 0.000419045394489532, + "loss": 0.7878446, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2938, + "time_per_iteration": 2.692997455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094877, + "balance_loss_mlp": 1.08429718, + "diversity_loss_mlp": 0.0, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.06508171061148607, + "language_loss": 0.76831025, + "learning_rate": 0.0004187379785098224, + "loss": 0.77925897, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2939, + "time_per_iteration": 3.123154401779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110149, + "balance_loss_mlp": 1.09110653, + "diversity_loss_mlp": 0.0, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.08014464510269267, + "language_loss": 0.83749938, + "learning_rate": 0.00041843059407882744, + "loss": 0.84851432, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2940, + "time_per_iteration": 2.9720611572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099107, + "balance_loss_mlp": 1.0887475, + "diversity_loss_mlp": 0.0, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.06910210619422795, + "language_loss": 0.82642627, + "learning_rate": 0.0004181232413158842, + "loss": 0.83741736, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2941, + "time_per_iteration": 2.657360315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094217, + "balance_loss_mlp": 1.08388722, + "diversity_loss_mlp": 0.0, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08913898875539945, + "language_loss": 0.82192254, + "learning_rate": 0.0004178159203403179, + "loss": 0.83286464, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2942, + "time_per_iteration": 2.8812596797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.07014799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06202774017820852, + "language_loss": 0.8130517, + "learning_rate": 0.0004175086312714409, + "loss": 0.82385445, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 2943, + "time_per_iteration": 2.561537027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.07015431, + "diversity_loss_mlp": 0.0, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05809127095966742, + "language_loss": 0.83570457, + "learning_rate": 0.00041720137422855366, + "loss": 0.84651101, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 2944, + "time_per_iteration": 2.7395284175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.06576228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.07239714207057282, + "language_loss": 0.79116005, + "learning_rate": 0.00041689414933094383, + "loss": 0.80191475, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2945, + "time_per_iteration": 2.654930353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067367, + "balance_loss_mlp": 1.05734193, + "diversity_loss_mlp": 0.0, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.07615309090382201, + "language_loss": 0.80823922, + "learning_rate": 0.00041658695669788653, + "loss": 0.81891298, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2946, + "time_per_iteration": 2.747903347015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.05894506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.09594015960064259, + "language_loss": 0.81304628, + "learning_rate": 0.00041627979644864453, + "loss": 0.82373923, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2947, + "time_per_iteration": 2.8192365169525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064628, + "balance_loss_mlp": 1.05435264, + "diversity_loss_mlp": 0.0, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06124486727819338, + "language_loss": 0.81212783, + "learning_rate": 0.0004159726687024683, + "loss": 0.82277411, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 2948, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_mlp": 1.05610037, + "diversity_loss_mlp": 0.0, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.0698899799050157, + "language_loss": 0.7929486, + "learning_rate": 0.00041566557357859506, + "loss": 0.80361444, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2949, + "time_per_iteration": 2.861374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05816913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.0603589352170923, + "language_loss": 0.79605162, + "learning_rate": 0.0004153585111962502, + "loss": 0.80673802, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2950, + "time_per_iteration": 3.3136749267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.06528509, + "diversity_loss_mlp": 0.0, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.07046051490297799, + "language_loss": 0.84271163, + "learning_rate": 0.0004150514816746453, + "loss": 0.85347259, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 2951, + "time_per_iteration": 2.7142550945281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079575, + "balance_loss_mlp": 1.0689894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07561213643312675, + "language_loss": 0.85564739, + "learning_rate": 0.0004147444851329802, + "loss": 0.8664431, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2952, + "time_per_iteration": 2.663442611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079915, + "balance_loss_mlp": 1.06943655, + "diversity_loss_mlp": 0.0, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.06334656392280237, + "language_loss": 0.85917854, + "learning_rate": 0.00041443752169044126, + "loss": 0.86997765, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2953, + "time_per_iteration": 3.0424787998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083209, + "balance_loss_mlp": 1.07296944, + "diversity_loss_mlp": 0.0, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.08759511227816434, + "language_loss": 0.84844387, + "learning_rate": 0.0004141305914662025, + "loss": 0.85927594, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2954, + "time_per_iteration": 2.720574378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080604, + "balance_loss_mlp": 1.06977344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0625505952609041, + "language_loss": 0.80443704, + "learning_rate": 0.0004138236945794246, + "loss": 0.81524312, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2955, + "time_per_iteration": 2.880007743835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067912, + "balance_loss_mlp": 1.05775595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.08164782403227437, + "language_loss": 0.84066302, + "learning_rate": 0.00041351683114925576, + "loss": 0.85134214, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2956, + "time_per_iteration": 3.061213731765747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.06213737, + "diversity_loss_mlp": 0.0, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06079019071224684, + "language_loss": 0.86355555, + "learning_rate": 0.0004132100012948308, + "loss": 0.87427759, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 2957, + "time_per_iteration": 2.631786823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_mlp": 1.0587523, + "diversity_loss_mlp": 0.0, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.07979265854660174, + "language_loss": 0.84526646, + "learning_rate": 0.00041290320513527145, + "loss": 0.85595882, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2958, + "time_per_iteration": 2.5593366622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.05111814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.09201222931646683, + "language_loss": 0.85128796, + "learning_rate": 0.0004125964427896867, + "loss": 0.86190271, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.1036377, + "routerloss_mlp": 0.0, + "step": 2959, + "time_per_iteration": 2.667381525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.05320501, + "diversity_loss_mlp": 0.0, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06922825543149586, + "language_loss": 0.79212141, + "learning_rate": 0.0004122897143771723, + "loss": 0.80275661, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2960, + "time_per_iteration": 2.523068904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.0569005, + "diversity_loss_mlp": 0.0, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06880331468011665, + "language_loss": 0.81306094, + "learning_rate": 0.0004119830200168109, + "loss": 0.82373345, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2961, + "time_per_iteration": 2.7224626541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106382, + "balance_loss_mlp": 1.05356169, + "diversity_loss_mlp": 0.0, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08443053343043137, + "language_loss": 0.88515878, + "learning_rate": 0.0004116763598276714, + "loss": 0.89579695, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 2962, + "time_per_iteration": 2.4910728931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05738318, + "diversity_loss_mlp": 0.0, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.07427131552828858, + "language_loss": 0.81298989, + "learning_rate": 0.00041136973392881017, + "loss": 0.82366574, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 2963, + "time_per_iteration": 2.8261218070983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_mlp": 1.05275846, + "diversity_loss_mlp": 0.0, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.0795338566562928, + "language_loss": 0.82039535, + "learning_rate": 0.00041106314243926983, + "loss": 0.83102989, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2964, + "time_per_iteration": 2.7321033477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058191, + "balance_loss_mlp": 1.04802823, + "diversity_loss_mlp": 0.0, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.07985594809339186, + "language_loss": 0.87473917, + "learning_rate": 0.0004107565854780798, + "loss": 0.88532114, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2965, + "time_per_iteration": 2.685188055038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105982, + "balance_loss_mlp": 1.0495863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.12021988187086102, + "language_loss": 0.80887079, + "learning_rate": 0.000410450063164256, + "loss": 0.81946903, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2966, + "time_per_iteration": 2.8859732151031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_mlp": 1.05084372, + "diversity_loss_mlp": 0.0, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.07877125068742231, + "language_loss": 0.82298398, + "learning_rate": 0.00041014357561680115, + "loss": 0.83359516, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2967, + "time_per_iteration": 2.5546090602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072036, + "balance_loss_mlp": 1.06186163, + "diversity_loss_mlp": 0.0, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0603559044145355, + "language_loss": 0.86396813, + "learning_rate": 0.0004098371229547039, + "loss": 0.87468845, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 2968, + "time_per_iteration": 2.7246880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055906, + "balance_loss_mlp": 1.05082798, + "diversity_loss_mlp": 0.0, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.032213471653528905, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81066716, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2969, + "time_per_iteration": 4.802457571029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845784, + "balance_loss_mlp": 1.44834208, + "diversity_loss_mlp": 0.21849446, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.042172582609019446, + "language_loss": 0.80489594, + "learning_rate": 0.00040922432276247107, + "loss": 0.81335378, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01236574, + "step": 2970, + "time_per_iteration": 2.579711675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100592, + "balance_loss_mlp": 1.09026289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.08651791755700546, + "language_loss": 0.84556907, + "learning_rate": 0.0004089179754702457, + "loss": 0.85657501, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2971, + "time_per_iteration": 2.744509220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109172, + "balance_loss_mlp": 1.08128309, + "diversity_loss_mlp": 0.0, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.0875480726861112, + "language_loss": 0.79658413, + "learning_rate": 0.00040861166353919843, + "loss": 0.80750132, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2972, + "time_per_iteration": 2.816767692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843649, + "balance_loss_mlp": 1.44322622, + "diversity_loss_mlp": 0.21953782, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.0303598736791247, + "language_loss": 0.81879437, + "learning_rate": 0.00040830538708824983, + "loss": 0.82723081, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226737, + "step": 2973, + "time_per_iteration": 2.8936269283294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084736, + "balance_loss_mlp": 1.07479978, + "diversity_loss_mlp": 0.0, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.06866249599002382, + "language_loss": 0.81754982, + "learning_rate": 0.000407999146236307, + "loss": 0.82839715, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2974, + "time_per_iteration": 2.558587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086743, + "balance_loss_mlp": 1.07657444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.07286762161416734, + "language_loss": 0.83382261, + "learning_rate": 0.0004076929411022634, + "loss": 0.84468997, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2975, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.07231879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.06868291627032407, + "language_loss": 0.79575276, + "learning_rate": 0.0004073867718049982, + "loss": 0.80657583, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 2976, + "time_per_iteration": 3.082519054412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841274, + "balance_loss_mlp": 1.44052804, + "diversity_loss_mlp": 0.21771878, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.03510584247140754, + "language_loss": 0.8255651, + "learning_rate": 0.00040708063846337704, + "loss": 0.83397782, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01215104, + "step": 2977, + "time_per_iteration": 2.7563750743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108914, + "balance_loss_mlp": 1.07897186, + "diversity_loss_mlp": 0.0, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07105452232664011, + "language_loss": 0.81019402, + "learning_rate": 0.00040677454119625143, + "loss": 0.82108539, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2978, + "time_per_iteration": 2.575923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089611, + "balance_loss_mlp": 1.07962155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07243213986729599, + "language_loss": 0.82912952, + "learning_rate": 0.0004064684801224587, + "loss": 0.84002566, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 2979, + "time_per_iteration": 2.5965535640716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085844, + "balance_loss_mlp": 1.07600939, + "diversity_loss_mlp": 0.0, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.11138747568582645, + "language_loss": 0.80322999, + "learning_rate": 0.00040616245536082224, + "loss": 0.81408834, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2980, + "time_per_iteration": 2.599320650100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079792, + "balance_loss_mlp": 1.07008803, + "diversity_loss_mlp": 0.0, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.06764455313032879, + "language_loss": 0.81366718, + "learning_rate": 0.00040585646703015165, + "loss": 0.82446504, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2981, + "time_per_iteration": 2.8000056743621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0740515, + "diversity_loss_mlp": 0.0, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07435230765684324, + "language_loss": 0.78094304, + "learning_rate": 0.0004055505152492419, + "loss": 0.79178286, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2982, + "time_per_iteration": 2.6867222785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_mlp": 1.06574273, + "diversity_loss_mlp": 0.0, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.06874763078804642, + "language_loss": 0.74040514, + "learning_rate": 0.00040524460013687425, + "loss": 0.7511642, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2983, + "time_per_iteration": 2.722419500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06058455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.06717754752260814, + "language_loss": 0.81118953, + "learning_rate": 0.0004049387218118155, + "loss": 0.82189637, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2984, + "time_per_iteration": 2.960744857788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.05519915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07543134348802799, + "language_loss": 0.85138291, + "learning_rate": 0.00040463288039281777, + "loss": 0.86203879, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2985, + "time_per_iteration": 2.769758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.03847778, + "diversity_loss_mlp": 0.0, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.0202426857746204, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78919691, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 2986, + "time_per_iteration": 4.966659784317017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_mlp": 1.05206716, + "diversity_loss_mlp": 0.0, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.15131369926607025, + "language_loss": 0.82060635, + "learning_rate": 0.0004040213087479444, + "loss": 0.83122802, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2987, + "time_per_iteration": 2.9445290565490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.0615747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.0782867157663105, + "language_loss": 0.85397077, + "learning_rate": 0.0004037155787595018, + "loss": 0.86468589, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2988, + "time_per_iteration": 2.5765254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066911, + "balance_loss_mlp": 1.05708241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06722963936024443, + "language_loss": 0.80743146, + "learning_rate": 0.000403409886151987, + "loss": 0.81810057, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2989, + "time_per_iteration": 2.916736364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_mlp": 1.02410662, + "diversity_loss_mlp": 0.0, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.01652195359171043, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.8302803, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.0480957, + "routerloss_mlp": 0.0, + "step": 2990, + "time_per_iteration": 4.79939866065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019783, + "balance_loss_mlp": 1.0149194, + "diversity_loss_mlp": 0.0, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.012607930583697005, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79218388, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 2991, + "time_per_iteration": 4.873241901397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_mlp": 1.06493187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07321689676824589, + "language_loss": 0.7675758, + "learning_rate": 0.00040249303380173807, + "loss": 0.77832061, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 2992, + "time_per_iteration": 3.119454860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075897, + "balance_loss_mlp": 1.06607461, + "diversity_loss_mlp": 0.0, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06951674167184135, + "language_loss": 0.78929973, + "learning_rate": 0.00040218749190459126, + "loss": 0.80005872, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 2993, + "time_per_iteration": 2.735741138458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074749, + "balance_loss_mlp": 1.06464601, + "diversity_loss_mlp": 0.0, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.09040694151318206, + "language_loss": 0.82524914, + "learning_rate": 0.00040188198798162775, + "loss": 0.83599663, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2994, + "time_per_iteration": 2.604189872741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107928, + "balance_loss_mlp": 1.06903386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.07247823517444965, + "language_loss": 0.85413349, + "learning_rate": 0.000401576522151455, + "loss": 0.86492634, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.10247803, + "routerloss_mlp": 0.0, + "step": 2995, + "time_per_iteration": 2.8580820560455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082336, + "balance_loss_mlp": 1.07231033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.07641213429349043, + "language_loss": 0.82611746, + "learning_rate": 0.0004012710945326651, + "loss": 0.83694082, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2996, + "time_per_iteration": 2.7899913787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093927, + "balance_loss_mlp": 1.08396673, + "diversity_loss_mlp": 0.0, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.06499516885792743, + "language_loss": 0.81305802, + "learning_rate": 0.0004009657052438355, + "loss": 0.82399726, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 2997, + "time_per_iteration": 2.7985143661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109354, + "balance_loss_mlp": 1.08339536, + "diversity_loss_mlp": 0.0, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.07919341256021087, + "language_loss": 0.85873878, + "learning_rate": 0.00040066035440352904, + "loss": 0.86967415, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2998, + "time_per_iteration": 2.633052110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_mlp": 1.02706063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.024696349234847453, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80325484, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2999, + "time_per_iteration": 4.901000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111633, + "balance_loss_mlp": 1.10161996, + "diversity_loss_mlp": 0.0, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.09685011562347093, + "language_loss": 0.76085562, + "learning_rate": 0.00040004976854266145, + "loss": 0.77197194, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3000, + "time_per_iteration": 2.5440561771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_mlp": 1.09615445, + "diversity_loss_mlp": 0.0, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.08566214489971447, + "language_loss": 0.81596673, + "learning_rate": 0.0003997445337591505, + "loss": 0.82703155, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.10327148, + "routerloss_mlp": 0.0, + "step": 3001, + "time_per_iteration": 2.6576101779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101254, + "balance_loss_mlp": 1.09120488, + "diversity_loss_mlp": 0.0, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.07034086792873868, + "language_loss": 0.74008942, + "learning_rate": 0.0003994393378982635, + "loss": 0.75110197, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3002, + "time_per_iteration": 2.646756172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_mlp": 1.02816153, + "diversity_loss_mlp": 0.0, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.018933197318392565, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80571294, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3003, + "time_per_iteration": 4.810927867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084692, + "balance_loss_mlp": 1.07440448, + "diversity_loss_mlp": 0.0, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.09168460196837042, + "language_loss": 0.8788178, + "learning_rate": 0.0003988290634182961, + "loss": 0.88966477, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3004, + "time_per_iteration": 2.8026678562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086517, + "balance_loss_mlp": 1.0765686, + "diversity_loss_mlp": 0.0, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07023697016091271, + "language_loss": 0.80836314, + "learning_rate": 0.0003985239850361453, + "loss": 0.81922829, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3005, + "time_per_iteration": 2.605581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_mlp": 1.0739491, + "diversity_loss_mlp": 0.0, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.08589270039345176, + "language_loss": 0.84542817, + "learning_rate": 0.0003982189460504777, + "loss": 0.85626608, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3006, + "time_per_iteration": 2.755309820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081707, + "balance_loss_mlp": 1.07148504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07367765629951939, + "language_loss": 0.79058981, + "learning_rate": 0.00039791394657971935, + "loss": 0.80140698, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3007, + "time_per_iteration": 2.7115721702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083463, + "balance_loss_mlp": 1.07349145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08639799759711958, + "language_loss": 0.84195948, + "learning_rate": 0.00039760898674228205, + "loss": 0.85279417, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3008, + "time_per_iteration": 2.6536192893981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.07249665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.06522284264232586, + "language_loss": 0.80620825, + "learning_rate": 0.0003973040666565613, + "loss": 0.81703728, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 3009, + "time_per_iteration": 3.0663528442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_mlp": 1.07382393, + "diversity_loss_mlp": 0.0, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06612730330601824, + "language_loss": 0.82148051, + "learning_rate": 0.000396999186440938, + "loss": 0.83232027, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3010, + "time_per_iteration": 2.8332176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078314, + "balance_loss_mlp": 1.06794286, + "diversity_loss_mlp": 0.0, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.0828593686110812, + "language_loss": 0.85258269, + "learning_rate": 0.000396694346213777, + "loss": 0.86336583, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 3011, + "time_per_iteration": 2.6009714603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107282, + "balance_loss_mlp": 1.06272256, + "diversity_loss_mlp": 0.0, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06962390382868744, + "language_loss": 0.83265769, + "learning_rate": 0.0003963895460934276, + "loss": 0.84338593, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3012, + "time_per_iteration": 3.1654391288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069146, + "balance_loss_mlp": 1.05900097, + "diversity_loss_mlp": 0.0, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07925389671051855, + "language_loss": 0.84790504, + "learning_rate": 0.00039608478619822376, + "loss": 0.85859656, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.10144043, + "routerloss_mlp": 0.0, + "step": 3013, + "time_per_iteration": 2.427522659301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.05792189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06006231039706783, + "language_loss": 0.82350284, + "learning_rate": 0.00039578006664648394, + "loss": 0.83418107, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3014, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073341, + "balance_loss_mlp": 1.06352377, + "diversity_loss_mlp": 0.0, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06972986465808689, + "language_loss": 0.81348431, + "learning_rate": 0.0003954753875565105, + "loss": 0.82421774, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3015, + "time_per_iteration": 3.0640695095062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072847, + "balance_loss_mlp": 1.06282723, + "diversity_loss_mlp": 0.0, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.07357715078918559, + "language_loss": 0.82623494, + "learning_rate": 0.00039517074904659057, + "loss": 0.83696342, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3016, + "time_per_iteration": 2.6665265560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010727, + "balance_loss_mlp": 1.06269789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.06753013197016527, + "language_loss": 0.84737754, + "learning_rate": 0.00039486615123499535, + "loss": 0.85810453, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3017, + "time_per_iteration": 2.868724822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067949, + "balance_loss_mlp": 1.05761325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.06414820954678578, + "language_loss": 0.84855384, + "learning_rate": 0.00039456159423997996, + "loss": 0.85923326, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 3018, + "time_per_iteration": 2.7043581008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067563, + "balance_loss_mlp": 1.05765033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06908857206879536, + "language_loss": 0.89950442, + "learning_rate": 0.00039425707817978406, + "loss": 0.91018009, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3019, + "time_per_iteration": 2.661128044128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106838, + "balance_loss_mlp": 1.0578835, + "diversity_loss_mlp": 0.0, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.08125232064199928, + "language_loss": 0.83649898, + "learning_rate": 0.00039395260317263124, + "loss": 0.84718275, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 3020, + "time_per_iteration": 2.5645148754119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.06039524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.06887634041791851, + "language_loss": 0.85043871, + "learning_rate": 0.0003936481693367291, + "loss": 0.86114681, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3021, + "time_per_iteration": 2.7062771320343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.06673217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08641696356618225, + "language_loss": 0.87619507, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697034, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 3022, + "time_per_iteration": 2.7680017948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.06846249, + "diversity_loss_mlp": 0.0, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.0708496595357851, + "language_loss": 0.78467089, + "learning_rate": 0.00039303942565142825, + "loss": 0.79545891, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3023, + "time_per_iteration": 2.7319986820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071706, + "balance_loss_mlp": 1.06121564, + "diversity_loss_mlp": 0.0, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.06941107329713525, + "language_loss": 0.76844412, + "learning_rate": 0.0003927351160383644, + "loss": 0.77916121, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3024, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.05980492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.07084631667240687, + "language_loss": 0.77815473, + "learning_rate": 0.000392430848069222, + "loss": 0.78885376, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3025, + "time_per_iteration": 2.5290136337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06532741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.07224483468752362, + "language_loss": 0.82501459, + "learning_rate": 0.00039212662186212795, + "loss": 0.83576977, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3026, + "time_per_iteration": 2.6017684936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106609, + "balance_loss_mlp": 1.05593956, + "diversity_loss_mlp": 0.0, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.05478704818063415, + "language_loss": 0.77076197, + "learning_rate": 0.0003918224375351934, + "loss": 0.78142285, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3027, + "time_per_iteration": 2.707127571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069708, + "balance_loss_mlp": 1.05940795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.07026049561627037, + "language_loss": 0.78559566, + "learning_rate": 0.0003915182952065135, + "loss": 0.79629278, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 3028, + "time_per_iteration": 2.6728062629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863261, + "balance_loss_mlp": 1.48110199, + "diversity_loss_mlp": 0.21947324, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.028926470462326558, + "language_loss": 0.87632734, + "learning_rate": 0.0003912141949941664, + "loss": 0.88495994, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0129736, + "step": 3029, + "time_per_iteration": 2.7290279865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05748928, + "diversity_loss_mlp": 0.0, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.11092566755711959, + "language_loss": 0.82848042, + "learning_rate": 0.0003909101370162143, + "loss": 0.83916146, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 3030, + "time_per_iteration": 2.5907628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.05161262, + "diversity_loss_mlp": 0.0, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.028764883169419067, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73491609, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.06103516, + "routerloss_mlp": 0.0, + "step": 3031, + "time_per_iteration": 4.87787127494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066859, + "balance_loss_mlp": 1.05651772, + "diversity_loss_mlp": 0.0, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.06710106844205427, + "language_loss": 0.82853395, + "learning_rate": 0.0003903021482356622, + "loss": 0.83920258, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3032, + "time_per_iteration": 2.777536153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05757427, + "diversity_loss_mlp": 0.0, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.05521171326439417, + "language_loss": 0.82775813, + "learning_rate": 0.00038999821766910465, + "loss": 0.83843517, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 3033, + "time_per_iteration": 2.990370035171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.05444503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.06933125597123427, + "language_loss": 0.85725427, + "learning_rate": 0.00038969432980902606, + "loss": 0.86790228, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 3034, + "time_per_iteration": 2.522594690322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101659, + "balance_loss_mlp": 1.01134527, + "diversity_loss_mlp": 0.0, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.016170176694849804, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80801094, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.05249023, + "routerloss_mlp": 0.0, + "step": 3035, + "time_per_iteration": 4.804777383804321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.06007361, + "diversity_loss_mlp": 0.0, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.06630987198212972, + "language_loss": 0.82630336, + "learning_rate": 0.00038908668268020953, + "loss": 0.83700585, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3036, + "time_per_iteration": 2.6598165035247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0547123, + "diversity_loss_mlp": 0.0, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.06353975651870693, + "language_loss": 0.85077345, + "learning_rate": 0.00038878292364738097, + "loss": 0.86142278, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3037, + "time_per_iteration": 2.817431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_mlp": 1.05653155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.06847185322789755, + "language_loss": 0.86992419, + "learning_rate": 0.0003884792077928508, + "loss": 0.88059008, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 3038, + "time_per_iteration": 2.515582323074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05704808, + "diversity_loss_mlp": 0.0, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.08132102193369704, + "language_loss": 0.76704037, + "learning_rate": 0.0003881755352345322, + "loss": 0.77771461, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3039, + "time_per_iteration": 2.506476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.05959702, + "diversity_loss_mlp": 0.0, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05655703451029381, + "language_loss": 0.87182224, + "learning_rate": 0.0003878719060903207, + "loss": 0.88252252, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 3040, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077595, + "balance_loss_mlp": 1.06733704, + "diversity_loss_mlp": 0.0, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.07213898072930079, + "language_loss": 0.83620822, + "learning_rate": 0.0003875683204780961, + "loss": 0.84698415, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3041, + "time_per_iteration": 2.7087528705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00858209, + "balance_loss_mlp": 1.47420132, + "diversity_loss_mlp": 0.21720865, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.0337374590034744, + "language_loss": 0.85750413, + "learning_rate": 0.00038726477851572043, + "loss": 0.86608613, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01250451, + "step": 3042, + "time_per_iteration": 2.8391060829162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085797, + "balance_loss_mlp": 1.07552087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.07424787281712622, + "language_loss": 0.8043561, + "learning_rate": 0.0003869612803210395, + "loss": 0.81521404, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3043, + "time_per_iteration": 2.6728439331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.07525158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.0731909762270397, + "language_loss": 0.83286428, + "learning_rate": 0.0003866578260118817, + "loss": 0.8437193, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 3044, + "time_per_iteration": 2.6332969665527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108221, + "balance_loss_mlp": 1.07239914, + "diversity_loss_mlp": 0.0, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07445534470947208, + "language_loss": 0.82966632, + "learning_rate": 0.0003863544157060581, + "loss": 0.84048843, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3045, + "time_per_iteration": 2.668837785720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081605, + "balance_loss_mlp": 1.07137656, + "diversity_loss_mlp": 0.0, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.07387128485113956, + "language_loss": 0.82359195, + "learning_rate": 0.0003860510495213634, + "loss": 0.83440793, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3046, + "time_per_iteration": 2.8229498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106696, + "balance_loss_mlp": 1.05705416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08160785595799389, + "language_loss": 0.78622752, + "learning_rate": 0.0003857477275755746, + "loss": 0.79689717, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3047, + "time_per_iteration": 2.6294050216674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.0557915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0580402220657833, + "language_loss": 0.83646655, + "learning_rate": 0.00038544444998645167, + "loss": 0.84712666, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3048, + "time_per_iteration": 3.0289785861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059507, + "balance_loss_mlp": 1.04951751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.0674332369398686, + "language_loss": 0.81847656, + "learning_rate": 0.00038514121687173767, + "loss": 0.82907164, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3049, + "time_per_iteration": 2.5797152519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058576, + "balance_loss_mlp": 1.04861593, + "diversity_loss_mlp": 0.0, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.08495884025795868, + "language_loss": 0.82019609, + "learning_rate": 0.00038483802834915807, + "loss": 0.83078188, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3050, + "time_per_iteration": 3.0199241638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_mlp": 1.05154216, + "diversity_loss_mlp": 0.0, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.07816426751212531, + "language_loss": 0.78978479, + "learning_rate": 0.00038453488453642074, + "loss": 0.800403, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3051, + "time_per_iteration": 2.7338953018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_mlp": 1.04610801, + "diversity_loss_mlp": 0.0, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.07385283463746846, + "language_loss": 0.86878967, + "learning_rate": 0.00038423178555121697, + "loss": 0.87935388, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.10308838, + "routerloss_mlp": 0.0, + "step": 3052, + "time_per_iteration": 2.7545297145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_mlp": 1.04783666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.07920619209623277, + "language_loss": 0.85583031, + "learning_rate": 0.00038392873151121994, + "loss": 0.86641347, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 3053, + "time_per_iteration": 3.07143235206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04924083, + "diversity_loss_mlp": 0.0, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07754087781816771, + "language_loss": 0.83137167, + "learning_rate": 0.0003836257225340859, + "loss": 0.84196955, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 3054, + "time_per_iteration": 2.6132304668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066843, + "balance_loss_mlp": 1.05597091, + "diversity_loss_mlp": 0.0, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.0689474058081498, + "language_loss": 0.82020974, + "learning_rate": 0.00038332275873745336, + "loss": 0.83087826, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.10882568, + "routerloss_mlp": 0.0, + "step": 3055, + "time_per_iteration": 3.107823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855378, + "balance_loss_mlp": 1.46855807, + "diversity_loss_mlp": 0.21676093, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.026786885849911755, + "language_loss": 0.82891941, + "learning_rate": 0.0003830198402389431, + "loss": 0.83747321, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01271825, + "step": 3056, + "time_per_iteration": 2.7645249366760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_mlp": 1.03548789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.027829027984012215, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78389645, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3057, + "time_per_iteration": 4.995454549789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082248, + "balance_loss_mlp": 1.07115602, + "diversity_loss_mlp": 0.0, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.10105227922023945, + "language_loss": 0.83302426, + "learning_rate": 0.0003824141396066855, + "loss": 0.8438468, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 3058, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086707, + "balance_loss_mlp": 1.07570362, + "diversity_loss_mlp": 0.0, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.10870959422332387, + "language_loss": 0.8283565, + "learning_rate": 0.000382111357708092, + "loss": 0.83922356, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 3059, + "time_per_iteration": 2.7063958644866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06985879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.09017347087331092, + "language_loss": 0.83373827, + "learning_rate": 0.00038180862157792864, + "loss": 0.84454447, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 3060, + "time_per_iteration": 2.7716259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.06098306, + "diversity_loss_mlp": 0.0, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06780881013643715, + "language_loss": 0.81814772, + "learning_rate": 0.0003815059313337279, + "loss": 0.82886124, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3061, + "time_per_iteration": 2.664134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.06180596, + "diversity_loss_mlp": 0.0, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.06335749004143083, + "language_loss": 0.78063929, + "learning_rate": 0.00038120328709300436, + "loss": 0.79135942, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3062, + "time_per_iteration": 2.8627028465270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.05566847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06769296518732247, + "language_loss": 0.8382163, + "learning_rate": 0.0003809006889732549, + "loss": 0.84887671, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3063, + "time_per_iteration": 2.809983253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.05686879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.07471445768221775, + "language_loss": 0.88052714, + "learning_rate": 0.0003805981370919589, + "loss": 0.89119434, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3064, + "time_per_iteration": 2.526881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_mlp": 1.05822492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.06588713514234819, + "language_loss": 0.83812523, + "learning_rate": 0.0003802956315665771, + "loss": 0.84880579, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3065, + "time_per_iteration": 2.6691834926605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072593, + "balance_loss_mlp": 1.06285346, + "diversity_loss_mlp": 0.0, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.11425397529110681, + "language_loss": 0.8185159, + "learning_rate": 0.0003799931725145529, + "loss": 0.82924175, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3066, + "time_per_iteration": 2.6098556518554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06719375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.07983506473752326, + "language_loss": 0.85902935, + "learning_rate": 0.00037969076005331083, + "loss": 0.86980045, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3067, + "time_per_iteration": 2.7626185417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081851, + "balance_loss_mlp": 1.07184935, + "diversity_loss_mlp": 0.0, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07247659487205776, + "language_loss": 0.8802191, + "learning_rate": 0.00037938839430025817, + "loss": 0.89103758, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3068, + "time_per_iteration": 2.6493396759033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088575, + "balance_loss_mlp": 1.07886577, + "diversity_loss_mlp": 0.0, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.0655302097756617, + "language_loss": 0.85496283, + "learning_rate": 0.0003790860753727835, + "loss": 0.8658486, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3069, + "time_per_iteration": 2.7941815853118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089673, + "balance_loss_mlp": 1.07995713, + "diversity_loss_mlp": 0.0, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0796849495747384, + "language_loss": 0.82864797, + "learning_rate": 0.00037878380338825766, + "loss": 0.83954477, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3070, + "time_per_iteration": 2.6861939430236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102877, + "balance_loss_mlp": 1.09311378, + "diversity_loss_mlp": 0.0, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.08458672700427887, + "language_loss": 0.81556624, + "learning_rate": 0.00037848157846403287, + "loss": 0.82659507, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3071, + "time_per_iteration": 2.873662233352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101959, + "balance_loss_mlp": 1.09236836, + "diversity_loss_mlp": 0.0, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.07248408902015292, + "language_loss": 0.83281767, + "learning_rate": 0.0003781794007174435, + "loss": 0.84383726, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3072, + "time_per_iteration": 2.762472629547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.08360386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.032251872290910595, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75162888, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3073, + "time_per_iteration": 4.854618787765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_mlp": 1.09715033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.058981009489694675, + "language_loss": 0.80947924, + "learning_rate": 0.0003775751872264152, + "loss": 0.8205511, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3074, + "time_per_iteration": 2.771085023880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_mlp": 1.09195375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.056077752757325364, + "language_loss": 0.87175214, + "learning_rate": 0.0003772731517165527, + "loss": 0.88277197, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.10028076, + "routerloss_mlp": 0.0, + "step": 3075, + "time_per_iteration": 2.8292393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103862, + "balance_loss_mlp": 1.09419441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.07602524147414737, + "language_loss": 0.83311272, + "learning_rate": 0.0003769711638534784, + "loss": 0.84415126, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3076, + "time_per_iteration": 2.97261381149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099488, + "balance_loss_mlp": 1.08962953, + "diversity_loss_mlp": 0.0, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07287223806238774, + "language_loss": 0.79046565, + "learning_rate": 0.00037666922375443446, + "loss": 0.8014605, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3077, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093205, + "balance_loss_mlp": 1.08349538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06803693763690793, + "language_loss": 0.81907725, + "learning_rate": 0.00037636733153664396, + "loss": 0.83000934, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3078, + "time_per_iteration": 2.8055219650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109815, + "balance_loss_mlp": 1.08854795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.08595437511710807, + "language_loss": 0.80202127, + "learning_rate": 0.0003760654873173124, + "loss": 0.81300277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3079, + "time_per_iteration": 2.6700353622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089136, + "balance_loss_mlp": 1.07927787, + "diversity_loss_mlp": 0.0, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06826446524438025, + "language_loss": 0.82043588, + "learning_rate": 0.00037576369121362566, + "loss": 0.8313272, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3080, + "time_per_iteration": 2.596071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089019, + "balance_loss_mlp": 1.07946444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.057614109423291045, + "language_loss": 0.81680822, + "learning_rate": 0.0003754619433427516, + "loss": 0.82769841, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3081, + "time_per_iteration": 2.9003093242645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.07771826, + "diversity_loss_mlp": 0.0, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.09118109008842482, + "language_loss": 0.7796042, + "learning_rate": 0.0003751602438218392, + "loss": 0.79047692, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3082, + "time_per_iteration": 2.7739951610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06927121, + "diversity_loss_mlp": 0.0, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.07641398361038237, + "language_loss": 0.84107417, + "learning_rate": 0.0003748585927680186, + "loss": 0.85186076, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3083, + "time_per_iteration": 2.6706809997558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087865, + "balance_loss_mlp": 1.07850111, + "diversity_loss_mlp": 0.0, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.07450452823339063, + "language_loss": 0.82992828, + "learning_rate": 0.00037455699029840086, + "loss": 0.84080696, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3084, + "time_per_iteration": 2.648775100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082396, + "balance_loss_mlp": 1.07310402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.0678124296562273, + "language_loss": 0.84694779, + "learning_rate": 0.0003742554365300787, + "loss": 0.85777175, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3085, + "time_per_iteration": 2.787437677383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00854998, + "balance_loss_mlp": 1.4632709, + "diversity_loss_mlp": 0.21810779, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.030613192067315453, + "language_loss": 0.79049134, + "learning_rate": 0.0003739539315801255, + "loss": 0.79904133, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01430825, + "step": 3086, + "time_per_iteration": 2.9476425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088902, + "balance_loss_mlp": 1.07956231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.08021663243926581, + "language_loss": 0.91758776, + "learning_rate": 0.000373652475565596, + "loss": 0.92847675, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3087, + "time_per_iteration": 2.473820924758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086482, + "balance_loss_mlp": 1.07684994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.0746565513598584, + "language_loss": 0.81288451, + "learning_rate": 0.00037335106860352587, + "loss": 0.8237493, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3088, + "time_per_iteration": 2.6710119247436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.07624292, + "diversity_loss_mlp": 0.0, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.06157127364570171, + "language_loss": 0.82947195, + "learning_rate": 0.00037304971081093146, + "loss": 0.84033072, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3089, + "time_per_iteration": 2.5530550479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095759, + "balance_loss_mlp": 1.0863055, + "diversity_loss_mlp": 0.0, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.06188782031055571, + "language_loss": 0.80896157, + "learning_rate": 0.00037274840230481024, + "loss": 0.81991911, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3090, + "time_per_iteration": 2.707697868347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094547, + "balance_loss_mlp": 1.08488476, + "diversity_loss_mlp": 0.0, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07660649649984981, + "language_loss": 0.79309815, + "learning_rate": 0.00037244714320214077, + "loss": 0.80404359, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3091, + "time_per_iteration": 2.524418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094365, + "balance_loss_mlp": 1.08449435, + "diversity_loss_mlp": 0.0, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.07189913531932149, + "language_loss": 0.83442843, + "learning_rate": 0.000372145933619882, + "loss": 0.84537208, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3092, + "time_per_iteration": 2.889267683029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098289, + "balance_loss_mlp": 1.0883646, + "diversity_loss_mlp": 0.0, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.08404319768947686, + "language_loss": 0.82928061, + "learning_rate": 0.000371844773674974, + "loss": 0.84026349, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3093, + "time_per_iteration": 2.729433059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00849837, + "balance_loss_mlp": 1.45755267, + "diversity_loss_mlp": 0.21677493, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.03215359042810467, + "language_loss": 0.82038867, + "learning_rate": 0.0003715436634843375, + "loss": 0.82888705, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267278, + "step": 3094, + "time_per_iteration": 2.8759658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110065, + "balance_loss_mlp": 1.10049295, + "diversity_loss_mlp": 0.0, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.05868361705811182, + "language_loss": 0.80998492, + "learning_rate": 0.00037124260316487355, + "loss": 0.82108557, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3095, + "time_per_iteration": 2.8515610694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.11049807, + "diversity_loss_mlp": 0.0, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06311708190042467, + "language_loss": 0.89435279, + "learning_rate": 0.0003709415928334643, + "loss": 0.90555483, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3096, + "time_per_iteration": 2.5820794105529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00850727, + "balance_loss_mlp": 1.45894229, + "diversity_loss_mlp": 0.21772251, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.03378868601366531, + "language_loss": 0.80653715, + "learning_rate": 0.00037064063260697233, + "loss": 0.81504446, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01239414, + "step": 3097, + "time_per_iteration": 2.897676467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138893, + "balance_loss_mlp": 1.12893891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06769209825818075, + "language_loss": 0.78597271, + "learning_rate": 0.0003703397226022407, + "loss": 0.79736161, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3098, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_mlp": 1.05123568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.0345928166567928, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76556545, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 3099, + "time_per_iteration": 4.977718114852905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847219, + "balance_loss_mlp": 1.45243645, + "diversity_loss_mlp": 0.21764749, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.029968084230811296, + "language_loss": 0.83180296, + "learning_rate": 0.0003697380537253339, + "loss": 0.84027505, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01217673, + "step": 3100, + "time_per_iteration": 2.673551559448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.11119175, + "diversity_loss_mlp": 0.0, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06630352939366652, + "language_loss": 0.81596649, + "learning_rate": 0.0003694372950867471, + "loss": 0.82717824, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3101, + "time_per_iteration": 2.7776670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119741, + "balance_loss_mlp": 1.1100198, + "diversity_loss_mlp": 0.0, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.07189145573728124, + "language_loss": 0.77408171, + "learning_rate": 0.0003691365871370976, + "loss": 0.78527915, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3102, + "time_per_iteration": 3.04355525970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116521, + "balance_loss_mlp": 1.1067102, + "diversity_loss_mlp": 0.0, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06839859357083694, + "language_loss": 0.8504554, + "learning_rate": 0.00036883592999313093, + "loss": 0.8616206, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3103, + "time_per_iteration": 2.6881608963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.1020087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07720585150601726, + "language_loss": 0.7960434, + "learning_rate": 0.0003685353237715722, + "loss": 0.80715817, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3104, + "time_per_iteration": 2.910879135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_mlp": 1.09433126, + "diversity_loss_mlp": 0.0, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.08349083770410728, + "language_loss": 0.81658864, + "learning_rate": 0.0003682347685891274, + "loss": 0.82762903, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3105, + "time_per_iteration": 2.8556530475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093856, + "balance_loss_mlp": 1.08412814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07861180875636395, + "language_loss": 0.80587226, + "learning_rate": 0.0003679342645624822, + "loss": 0.81681079, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3106, + "time_per_iteration": 2.9788949489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091288, + "balance_loss_mlp": 1.08144689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.062123999367099406, + "language_loss": 0.81345969, + "learning_rate": 0.0003676338118083025, + "loss": 0.82437259, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3107, + "time_per_iteration": 3.0514276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083265, + "balance_loss_mlp": 1.07369304, + "diversity_loss_mlp": 0.0, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.07200241428310707, + "language_loss": 0.79341209, + "learning_rate": 0.0003673334104432347, + "loss": 0.8042447, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3108, + "time_per_iteration": 2.6402766704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_mlp": 1.07493854, + "diversity_loss_mlp": 0.0, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.06431634181531254, + "language_loss": 0.83437502, + "learning_rate": 0.0003670330605839048, + "loss": 0.84521937, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3109, + "time_per_iteration": 2.8350021839141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071839, + "balance_loss_mlp": 1.06252289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08338826074003908, + "language_loss": 0.76629049, + "learning_rate": 0.0003667327623469191, + "loss": 0.77700889, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3110, + "time_per_iteration": 2.7434427738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086579, + "balance_loss_mlp": 1.0770725, + "diversity_loss_mlp": 0.0, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07334566089126898, + "language_loss": 0.7758621, + "learning_rate": 0.00036643251584886333, + "loss": 0.78672791, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3111, + "time_per_iteration": 2.7712619304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080276, + "balance_loss_mlp": 1.07075715, + "diversity_loss_mlp": 0.0, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.0661546294312284, + "language_loss": 0.81729323, + "learning_rate": 0.00036613232120630393, + "loss": 0.82809597, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3112, + "time_per_iteration": 2.6437926292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.06822348, + "diversity_loss_mlp": 0.0, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.09952194732663294, + "language_loss": 0.80305058, + "learning_rate": 0.00036583217853578643, + "loss": 0.81383061, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3113, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085265, + "balance_loss_mlp": 1.07562053, + "diversity_loss_mlp": 0.0, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.09394979208953491, + "language_loss": 0.77671385, + "learning_rate": 0.000365532087953837, + "loss": 0.78756654, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3114, + "time_per_iteration": 3.6197850704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075561, + "balance_loss_mlp": 1.06598282, + "diversity_loss_mlp": 0.0, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.08322265150120763, + "language_loss": 0.89675403, + "learning_rate": 0.00036523204957696065, + "loss": 0.90750962, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3115, + "time_per_iteration": 2.5928850173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05900383, + "diversity_loss_mlp": 0.0, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.07018475264035358, + "language_loss": 0.80565965, + "learning_rate": 0.00036493206352164324, + "loss": 0.81634748, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3116, + "time_per_iteration": 2.9302330017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070118, + "balance_loss_mlp": 1.06046212, + "diversity_loss_mlp": 0.0, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.07338463965566117, + "language_loss": 0.85090643, + "learning_rate": 0.000364632129904349, + "loss": 0.86160767, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3117, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072158, + "balance_loss_mlp": 1.0622344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.06545944211786243, + "language_loss": 0.78013116, + "learning_rate": 0.00036433224884152283, + "loss": 0.79085279, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3118, + "time_per_iteration": 2.714756727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_mlp": 1.06249511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08041065589047977, + "language_loss": 0.77752131, + "learning_rate": 0.00036403242044958875, + "loss": 0.78824466, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3119, + "time_per_iteration": 2.583292245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078089, + "balance_loss_mlp": 1.06846261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.07420053325288596, + "language_loss": 0.91699272, + "learning_rate": 0.0003637326448449507, + "loss": 0.92777365, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3120, + "time_per_iteration": 2.717006206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080309, + "balance_loss_mlp": 1.07065916, + "diversity_loss_mlp": 0.0, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.053625374444117885, + "language_loss": 0.86324787, + "learning_rate": 0.00036343292214399177, + "loss": 0.87405097, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3121, + "time_per_iteration": 2.7628395557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092008, + "balance_loss_mlp": 1.08205438, + "diversity_loss_mlp": 0.0, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08110417303016995, + "language_loss": 0.77154052, + "learning_rate": 0.00036313325246307456, + "loss": 0.78246063, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3122, + "time_per_iteration": 2.7920055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097808, + "balance_loss_mlp": 1.08813453, + "diversity_loss_mlp": 0.0, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07750521229706399, + "language_loss": 0.87508434, + "learning_rate": 0.0003628336359185411, + "loss": 0.88606238, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3123, + "time_per_iteration": 2.6752257347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086195, + "balance_loss_mlp": 1.07632422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09005007447476754, + "language_loss": 0.75524527, + "learning_rate": 0.000362534072626713, + "loss": 0.7661072, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3124, + "time_per_iteration": 2.7923338413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077955, + "balance_loss_mlp": 1.06818557, + "diversity_loss_mlp": 0.0, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.07223530633843779, + "language_loss": 0.81714958, + "learning_rate": 0.00036223456270389093, + "loss": 0.82792914, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3125, + "time_per_iteration": 3.0091912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075718, + "balance_loss_mlp": 1.06540036, + "diversity_loss_mlp": 0.0, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.06403369467156497, + "language_loss": 0.80792087, + "learning_rate": 0.00036193510626635517, + "loss": 0.81867802, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3126, + "time_per_iteration": 2.704378843307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066992, + "balance_loss_mlp": 1.05687714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06193993783441067, + "language_loss": 0.81725299, + "learning_rate": 0.0003616357034303649, + "loss": 0.82792288, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3127, + "time_per_iteration": 3.002530813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_mlp": 1.05243957, + "diversity_loss_mlp": 0.0, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.054941683840542065, + "language_loss": 0.78751493, + "learning_rate": 0.0003613363543121584, + "loss": 0.79813826, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3128, + "time_per_iteration": 2.8690690994262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063837, + "balance_loss_mlp": 1.05367482, + "diversity_loss_mlp": 0.0, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.06760978748019858, + "language_loss": 0.85022873, + "learning_rate": 0.00036103705902795357, + "loss": 0.86086708, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3129, + "time_per_iteration": 2.7233073711395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106265, + "balance_loss_mlp": 1.0526309, + "diversity_loss_mlp": 0.0, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08999540715217709, + "language_loss": 0.79606092, + "learning_rate": 0.0003607378176939471, + "loss": 0.80668741, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3130, + "time_per_iteration": 2.6465327739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060318, + "balance_loss_mlp": 1.0503943, + "diversity_loss_mlp": 0.0, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0812918345139536, + "language_loss": 0.82358718, + "learning_rate": 0.00036043863042631465, + "loss": 0.83419037, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3131, + "time_per_iteration": 2.645275354385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_mlp": 1.05113363, + "diversity_loss_mlp": 0.0, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07968064937120022, + "language_loss": 0.7648955, + "learning_rate": 0.00036013949734121133, + "loss": 0.77550471, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3132, + "time_per_iteration": 3.1564602851867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847858, + "balance_loss_mlp": 1.44895816, + "diversity_loss_mlp": 0.22101411, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.03213509913040014, + "language_loss": 0.82544625, + "learning_rate": 0.00035984041855477043, + "loss": 0.83392477, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01287225, + "step": 3133, + "time_per_iteration": 2.7710041999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00606016, + "balance_loss_mlp": 1.03831875, + "diversity_loss_mlp": 0.14934492, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.0016585081527992916, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79315913, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01218408, + "step": 3134, + "time_per_iteration": 5.010243892669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058814, + "balance_loss_mlp": 1.04887819, + "diversity_loss_mlp": 0.0, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06935738535706247, + "language_loss": 0.79867685, + "learning_rate": 0.00035924242434230637, + "loss": 0.80926502, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3135, + "time_per_iteration": 2.644461154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059705, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08930778928911463, + "language_loss": 0.78960454, + "learning_rate": 0.00035894350914844516, + "loss": 0.80020154, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3136, + "time_per_iteration": 2.6219546794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_mlp": 1.05073738, + "diversity_loss_mlp": 0.0, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.07477991129212373, + "language_loss": 0.82716846, + "learning_rate": 0.0003586446487175703, + "loss": 0.83777732, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3137, + "time_per_iteration": 2.7377843856811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057253, + "balance_loss_mlp": 1.04716182, + "diversity_loss_mlp": 0.0, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.06084036951856249, + "language_loss": 0.85439289, + "learning_rate": 0.0003583458431657099, + "loss": 0.86496538, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3138, + "time_per_iteration": 2.773810863494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056899, + "balance_loss_mlp": 1.04697502, + "diversity_loss_mlp": 0.0, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.10358798927054172, + "language_loss": 0.82887417, + "learning_rate": 0.00035804709260887056, + "loss": 0.83944315, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3139, + "time_per_iteration": 2.7064261436462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0084935, + "balance_loss_mlp": 1.45506001, + "diversity_loss_mlp": 0.21838406, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.02792942393132789, + "language_loss": 0.89382195, + "learning_rate": 0.0003577483971630373, + "loss": 0.9023155, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01262751, + "step": 3140, + "time_per_iteration": 2.747962236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063532, + "balance_loss_mlp": 1.053352, + "diversity_loss_mlp": 0.0, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05833739987767841, + "language_loss": 0.84937215, + "learning_rate": 0.00035744975694417414, + "loss": 0.86000752, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3141, + "time_per_iteration": 2.886625289916992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060532, + "balance_loss_mlp": 1.05025589, + "diversity_loss_mlp": 0.0, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07799366016494108, + "language_loss": 0.82322264, + "learning_rate": 0.00035715117206822344, + "loss": 0.83382797, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3142, + "time_per_iteration": 2.8120434284210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061407, + "balance_loss_mlp": 1.05125666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.06292121779847899, + "language_loss": 0.80965286, + "learning_rate": 0.0003568526426511065, + "loss": 0.82026696, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3143, + "time_per_iteration": 2.600508689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857497, + "balance_loss_mlp": 1.4695704, + "diversity_loss_mlp": 0.22092447, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.033476134745844106, + "language_loss": 0.83131814, + "learning_rate": 0.000356554168808722, + "loss": 0.8398931, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0122495, + "step": 3144, + "time_per_iteration": 3.026810646057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106893, + "balance_loss_mlp": 1.058887, + "diversity_loss_mlp": 0.0, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07082652980877534, + "language_loss": 0.85014772, + "learning_rate": 0.00035625575065694837, + "loss": 0.86083698, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.10040283, + "routerloss_mlp": 0.0, + "step": 3145, + "time_per_iteration": 2.840867519378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845224, + "balance_loss_mlp": 1.44920301, + "diversity_loss_mlp": 0.21683007, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.03030378734616264, + "language_loss": 0.77627134, + "learning_rate": 0.0003559573883116415, + "loss": 0.78472358, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220777, + "step": 3146, + "time_per_iteration": 2.7349908351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107352, + "balance_loss_mlp": 1.06324959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.05605665058846549, + "language_loss": 0.85758018, + "learning_rate": 0.00035565908188863604, + "loss": 0.86831534, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.10272217, + "routerloss_mlp": 0.0, + "step": 3147, + "time_per_iteration": 2.8125319480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845087, + "balance_loss_mlp": 1.44807422, + "diversity_loss_mlp": 0.21802135, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.03003998541469304, + "language_loss": 0.79795343, + "learning_rate": 0.00035536083150374464, + "loss": 0.80640435, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203923, + "step": 3148, + "time_per_iteration": 2.8052470684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.01191068, + "diversity_loss_mlp": 0.0, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.017174605961616223, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75765514, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3149, + "time_per_iteration": 4.839694023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068624, + "balance_loss_mlp": 1.05813408, + "diversity_loss_mlp": 0.0, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.07659984741592324, + "language_loss": 0.86092103, + "learning_rate": 0.0003547644993114475, + "loss": 0.87160718, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 3150, + "time_per_iteration": 2.847841739654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072042, + "balance_loss_mlp": 1.06145024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.11052058943541425, + "language_loss": 0.79770887, + "learning_rate": 0.00035446641773555806, + "loss": 0.80842924, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 3151, + "time_per_iteration": 2.748117208480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068453, + "balance_loss_mlp": 1.05804002, + "diversity_loss_mlp": 0.0, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.06928200582264574, + "language_loss": 0.87033039, + "learning_rate": 0.000354168392660816, + "loss": 0.88101488, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 3152, + "time_per_iteration": 2.7237491607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064757, + "balance_loss_mlp": 1.05449951, + "diversity_loss_mlp": 0.0, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.08776252561897581, + "language_loss": 0.83035654, + "learning_rate": 0.0003538704242029252, + "loss": 0.84100413, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3153, + "time_per_iteration": 2.687469959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064416, + "balance_loss_mlp": 1.05382478, + "diversity_loss_mlp": 0.0, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.06996316305541914, + "language_loss": 0.78274238, + "learning_rate": 0.0003535725124775672, + "loss": 0.79338652, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 3154, + "time_per_iteration": 2.844794750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056628, + "balance_loss_mlp": 1.04631591, + "diversity_loss_mlp": 0.0, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06399916678040601, + "language_loss": 0.86628783, + "learning_rate": 0.00035327465760040126, + "loss": 0.87685412, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3155, + "time_per_iteration": 2.7096383571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_mlp": 1.03957009, + "diversity_loss_mlp": 0.0, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08275092128409181, + "language_loss": 0.84610963, + "learning_rate": 0.00035297685968706526, + "loss": 0.85660648, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3156, + "time_per_iteration": 2.770024061203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_mlp": 1.04370594, + "diversity_loss_mlp": 0.0, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07863496537101755, + "language_loss": 0.83056825, + "learning_rate": 0.00035267911885317454, + "loss": 0.84110844, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3157, + "time_per_iteration": 2.671334743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.04051757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.06000790250856451, + "language_loss": 0.81843442, + "learning_rate": 0.0003523814352143222, + "loss": 0.82894027, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3158, + "time_per_iteration": 2.820080518722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04349208, + "diversity_loss_mlp": 0.0, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.0842902191025903, + "language_loss": 0.91154212, + "learning_rate": 0.00035208380888607937, + "loss": 0.92207724, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3159, + "time_per_iteration": 2.769655466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_mlp": 1.02448559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.01971528727847153, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80491835, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3160, + "time_per_iteration": 4.852057933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020567, + "balance_loss_mlp": 1.015203, + "diversity_loss_mlp": 0.0, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015706814795434412, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76712799, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3161, + "time_per_iteration": 5.034492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105269, + "balance_loss_mlp": 1.04277158, + "diversity_loss_mlp": 0.0, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.07240231538807727, + "language_loss": 0.82060492, + "learning_rate": 0.00035119127492038446, + "loss": 0.83113182, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3162, + "time_per_iteration": 2.7958009243011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058888, + "balance_loss_mlp": 1.04918981, + "diversity_loss_mlp": 0.0, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.08243185287386566, + "language_loss": 0.8267377, + "learning_rate": 0.00035089387898984436, + "loss": 0.83732659, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3163, + "time_per_iteration": 3.0141196250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_mlp": 1.04982388, + "diversity_loss_mlp": 0.0, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07404044041946549, + "language_loss": 0.81452298, + "learning_rate": 0.0003505965409474343, + "loss": 0.82512313, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3164, + "time_per_iteration": 2.884279727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00822199, + "balance_loss_mlp": 1.40056133, + "diversity_loss_mlp": 0.21809974, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.02989314006565827, + "language_loss": 0.86555362, + "learning_rate": 0.0003502992609085913, + "loss": 0.8737756, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01286863, + "step": 3165, + "time_per_iteration": 2.665219306945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064479, + "balance_loss_mlp": 1.05481732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0721176964117247, + "language_loss": 0.82392001, + "learning_rate": 0.00035000203898872954, + "loss": 0.83456486, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3166, + "time_per_iteration": 3.0119569301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064027, + "balance_loss_mlp": 1.05416799, + "diversity_loss_mlp": 0.0, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.07129548452914211, + "language_loss": 0.84480536, + "learning_rate": 0.0003497048753032406, + "loss": 0.85544562, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3167, + "time_per_iteration": 2.854588031768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_mlp": 1.05985689, + "diversity_loss_mlp": 0.0, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.07231997141892146, + "language_loss": 0.80835009, + "learning_rate": 0.000349407769967494, + "loss": 0.8190484, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3168, + "time_per_iteration": 3.3936102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.06240892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.08318926372150726, + "language_loss": 0.8467539, + "learning_rate": 0.0003491107230968361, + "loss": 0.85747683, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.09881592, + "routerloss_mlp": 0.0, + "step": 3169, + "time_per_iteration": 2.618696928024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070277, + "balance_loss_mlp": 1.06021023, + "diversity_loss_mlp": 0.0, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06713277413300113, + "language_loss": 0.81751496, + "learning_rate": 0.00034881373480659085, + "loss": 0.82821774, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3170, + "time_per_iteration": 2.862299919128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063164, + "balance_loss_mlp": 1.05321598, + "diversity_loss_mlp": 0.0, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08200914133790435, + "language_loss": 0.77840459, + "learning_rate": 0.0003485168052120594, + "loss": 0.78903627, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3171, + "time_per_iteration": 2.564657688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060206, + "balance_loss_mlp": 1.05049598, + "diversity_loss_mlp": 0.0, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.07281146068818606, + "language_loss": 0.80045426, + "learning_rate": 0.00034821993442851973, + "loss": 0.81105626, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3172, + "time_per_iteration": 2.6049551963806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058405, + "balance_loss_mlp": 1.04840922, + "diversity_loss_mlp": 0.0, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.08175384117022455, + "language_loss": 0.82176208, + "learning_rate": 0.00034792312257122735, + "loss": 0.83234608, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3173, + "time_per_iteration": 2.6007068157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00813523, + "balance_loss_mlp": 1.38556361, + "diversity_loss_mlp": 0.21673629, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.0335182000566727, + "language_loss": 0.80848879, + "learning_rate": 0.00034762636975541506, + "loss": 0.81662405, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237353, + "step": 3174, + "time_per_iteration": 2.6783013343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061612, + "balance_loss_mlp": 1.05138397, + "diversity_loss_mlp": 0.0, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07909505551334972, + "language_loss": 0.81032109, + "learning_rate": 0.0003473296760962923, + "loss": 0.82093716, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.10229492, + "routerloss_mlp": 0.0, + "step": 3175, + "time_per_iteration": 2.7157249450683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017221, + "balance_loss_mlp": 1.01159382, + "diversity_loss_mlp": 0.0, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.020158265394599716, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79550958, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 3176, + "time_per_iteration": 4.707489728927612 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_mlp": 1.04915345, + "diversity_loss_mlp": 0.0, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.08734600695876651, + "language_loss": 0.8132062, + "learning_rate": 0.00034673646670883976, + "loss": 0.82379746, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.09973145, + "routerloss_mlp": 0.0, + "step": 3177, + "time_per_iteration": 2.965688705444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101108, + "balance_loss_mlp": 1.00557232, + "diversity_loss_mlp": 0.0, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.01801959168057259, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76726103, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.05517578, + "routerloss_mlp": 0.0, + "step": 3178, + "time_per_iteration": 4.958420991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00819092, + "balance_loss_mlp": 1.39532781, + "diversity_loss_mlp": 0.21795917, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.031831362939539476, + "language_loss": 0.81821573, + "learning_rate": 0.0003461434953300865, + "loss": 0.82640672, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01244847, + "step": 3179, + "time_per_iteration": 2.92270827293396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063068, + "balance_loss_mlp": 1.05295873, + "diversity_loss_mlp": 0.0, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.055258394831610054, + "language_loss": 0.81141388, + "learning_rate": 0.0003458470991817515, + "loss": 0.82204449, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3180, + "time_per_iteration": 2.9693758487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060777, + "balance_loss_mlp": 1.05068588, + "diversity_loss_mlp": 0.0, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.06960725666926779, + "language_loss": 0.85075366, + "learning_rate": 0.0003455507628808802, + "loss": 0.86136144, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3181, + "time_per_iteration": 2.6036593914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071608, + "balance_loss_mlp": 1.06117702, + "diversity_loss_mlp": 0.0, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.09091925049493645, + "language_loss": 0.84135175, + "learning_rate": 0.00034525448654252076, + "loss": 0.85206783, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.10430908, + "routerloss_mlp": 0.0, + "step": 3182, + "time_per_iteration": 2.636809825897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061719, + "balance_loss_mlp": 1.05150867, + "diversity_loss_mlp": 0.0, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.07252100888517035, + "language_loss": 0.82806599, + "learning_rate": 0.0003449582702816976, + "loss": 0.83868313, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3183, + "time_per_iteration": 2.707475423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.05986118, + "diversity_loss_mlp": 0.0, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07323153161974344, + "language_loss": 0.82831162, + "learning_rate": 0.0003446621142134122, + "loss": 0.8390131, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3184, + "time_per_iteration": 2.6639719009399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068209, + "balance_loss_mlp": 1.05824375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.08088263565451759, + "language_loss": 0.84134692, + "learning_rate": 0.0003443660184526424, + "loss": 0.85202903, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3185, + "time_per_iteration": 2.465219736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068542, + "balance_loss_mlp": 1.05862343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.06289917121629264, + "language_loss": 0.86502969, + "learning_rate": 0.0003440699831143429, + "loss": 0.87571514, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3186, + "time_per_iteration": 2.7979393005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_mlp": 1.05262065, + "diversity_loss_mlp": 0.0, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.07676649362634465, + "language_loss": 0.82236582, + "learning_rate": 0.0003437740083134449, + "loss": 0.83299029, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3187, + "time_per_iteration": 2.686150312423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066248, + "balance_loss_mlp": 1.0564487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.08991197971935971, + "language_loss": 0.83540225, + "learning_rate": 0.00034347809416485574, + "loss": 0.84606475, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3188, + "time_per_iteration": 2.604308605194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106686, + "balance_loss_mlp": 1.05696571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07330624647380965, + "language_loss": 0.81935883, + "learning_rate": 0.0003431822407834597, + "loss": 0.83002746, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3189, + "time_per_iteration": 2.786008596420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070082, + "balance_loss_mlp": 1.0602051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07745901872485048, + "language_loss": 0.84407461, + "learning_rate": 0.00034288644828411706, + "loss": 0.85477537, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3190, + "time_per_iteration": 3.4646387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078292, + "balance_loss_mlp": 1.06861246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.07529521339256182, + "language_loss": 0.75715351, + "learning_rate": 0.0003425907167816649, + "loss": 0.76793635, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3191, + "time_per_iteration": 2.874946117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808796, + "balance_loss_mlp": 1.37378812, + "diversity_loss_mlp": 0.21839428, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.033870623426287425, + "language_loss": 0.84848714, + "learning_rate": 0.00034229504639091623, + "loss": 0.85657513, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01270431, + "step": 3192, + "time_per_iteration": 2.8179514408111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074782, + "balance_loss_mlp": 1.06519175, + "diversity_loss_mlp": 0.0, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.07980932307836838, + "language_loss": 0.79876941, + "learning_rate": 0.0003419994372266606, + "loss": 0.80951726, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3193, + "time_per_iteration": 3.121509552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_mlp": 1.06069219, + "diversity_loss_mlp": 0.0, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.05544583647367184, + "language_loss": 0.82228541, + "learning_rate": 0.00034170388940366335, + "loss": 0.83298671, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3194, + "time_per_iteration": 2.725961685180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071987, + "balance_loss_mlp": 1.0625093, + "diversity_loss_mlp": 0.0, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.06534437990847952, + "language_loss": 0.80109018, + "learning_rate": 0.0003414084030366667, + "loss": 0.81181002, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3195, + "time_per_iteration": 3.127318859100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073594, + "balance_loss_mlp": 1.06399155, + "diversity_loss_mlp": 0.0, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07171859971508983, + "language_loss": 0.83377409, + "learning_rate": 0.0003411129782403883, + "loss": 0.84451008, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3196, + "time_per_iteration": 2.7145206928253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078425, + "balance_loss_mlp": 1.06870365, + "diversity_loss_mlp": 0.0, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.09666217933122766, + "language_loss": 0.85076511, + "learning_rate": 0.0003408176151295225, + "loss": 0.86154932, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3197, + "time_per_iteration": 2.5919525623321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.06990433, + "diversity_loss_mlp": 0.0, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.06581377475358774, + "language_loss": 0.77279031, + "learning_rate": 0.00034052231381873944, + "loss": 0.78358328, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3198, + "time_per_iteration": 2.597702741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082219, + "balance_loss_mlp": 1.07295024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.0683279233493331, + "language_loss": 0.85131848, + "learning_rate": 0.00034022707442268494, + "loss": 0.8621406, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3199, + "time_per_iteration": 2.562068223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080014, + "balance_loss_mlp": 1.07069743, + "diversity_loss_mlp": 0.0, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.0761762485373057, + "language_loss": 0.82035017, + "learning_rate": 0.0003399318970559813, + "loss": 0.83115035, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3200, + "time_per_iteration": 2.789898157119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_mlp": 1.07100666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.08069642466901547, + "language_loss": 0.84662288, + "learning_rate": 0.00033963678183322656, + "loss": 0.85742772, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3201, + "time_per_iteration": 3.026878595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091206, + "balance_loss_mlp": 1.08173513, + "diversity_loss_mlp": 0.0, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.059556899615455, + "language_loss": 0.82784677, + "learning_rate": 0.0003393417288689945, + "loss": 0.83875883, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3202, + "time_per_iteration": 2.6654982566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090691, + "balance_loss_mlp": 1.08118427, + "diversity_loss_mlp": 0.0, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.07467788423655687, + "language_loss": 0.76113433, + "learning_rate": 0.00033904673827783504, + "loss": 0.77204126, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3203, + "time_per_iteration": 2.92669939994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010864, + "balance_loss_mlp": 1.07689261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06286363142909755, + "language_loss": 0.8181622, + "learning_rate": 0.00033875181017427357, + "loss": 0.82902622, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3204, + "time_per_iteration": 2.5680675506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090188, + "balance_loss_mlp": 1.08068752, + "diversity_loss_mlp": 0.0, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.07085405603281952, + "language_loss": 0.81132901, + "learning_rate": 0.00033845694467281133, + "loss": 0.82223082, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3205, + "time_per_iteration": 2.8592958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806951, + "balance_loss_mlp": 1.37197065, + "diversity_loss_mlp": 0.21751499, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.030824309293312202, + "language_loss": 0.83412218, + "learning_rate": 0.00033816214188792516, + "loss": 0.84219164, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220786, + "step": 3206, + "time_per_iteration": 3.1863744258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087794, + "balance_loss_mlp": 1.07844186, + "diversity_loss_mlp": 0.0, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.07935266980456598, + "language_loss": 0.85488075, + "learning_rate": 0.00033786740193406784, + "loss": 0.86575866, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3207, + "time_per_iteration": 2.626253604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.07682097, + "diversity_loss_mlp": 0.0, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.07540350896316815, + "language_loss": 0.81724775, + "learning_rate": 0.00033757272492566736, + "loss": 0.82811046, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3208, + "time_per_iteration": 2.8899030685424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080715, + "balance_loss_mlp": 1.07114851, + "diversity_loss_mlp": 0.0, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.05796890161537444, + "language_loss": 0.87216032, + "learning_rate": 0.0003372781109771278, + "loss": 0.88296747, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3209, + "time_per_iteration": 2.752558708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077325, + "balance_loss_mlp": 1.06753802, + "diversity_loss_mlp": 0.0, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.06419749590312054, + "language_loss": 0.76373756, + "learning_rate": 0.0003369835602028281, + "loss": 0.7745108, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3210, + "time_per_iteration": 2.7878270149230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_mlp": 1.05842817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.0669620080474601, + "language_loss": 0.79502624, + "learning_rate": 0.0003366890727171232, + "loss": 0.8057074, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3211, + "time_per_iteration": 2.7112903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069089, + "balance_loss_mlp": 1.05950451, + "diversity_loss_mlp": 0.0, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08442057123784988, + "language_loss": 0.78359348, + "learning_rate": 0.00033639464863434313, + "loss": 0.79428434, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3212, + "time_per_iteration": 2.634425163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_mlp": 1.03023958, + "diversity_loss_mlp": 0.0, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.02134222442632316, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478121, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3213, + "time_per_iteration": 4.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066789, + "balance_loss_mlp": 1.05715084, + "diversity_loss_mlp": 0.0, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.07602232380536252, + "language_loss": 0.79711038, + "learning_rate": 0.00033580599113475543, + "loss": 0.80777824, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3214, + "time_per_iteration": 2.987006187438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065135, + "balance_loss_mlp": 1.0553956, + "diversity_loss_mlp": 0.0, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.0762428760353498, + "language_loss": 0.86394417, + "learning_rate": 0.00033551175794648507, + "loss": 0.87459552, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.09735107, + "routerloss_mlp": 0.0, + "step": 3215, + "time_per_iteration": 2.4780433177948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_mlp": 1.05447078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.059308624592263506, + "language_loss": 0.81911212, + "learning_rate": 0.00033521758861821365, + "loss": 0.82975602, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3216, + "time_per_iteration": 2.5746333599090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062859, + "balance_loss_mlp": 1.05332255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.06339313693664829, + "language_loss": 0.89093363, + "learning_rate": 0.0003349234832641479, + "loss": 0.90156221, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3217, + "time_per_iteration": 2.561518669128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.05323243, + "diversity_loss_mlp": 0.0, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07035473810033784, + "language_loss": 0.81230485, + "learning_rate": 0.00033462944199846975, + "loss": 0.82293189, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3218, + "time_per_iteration": 3.0372345447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065156, + "balance_loss_mlp": 1.05549467, + "diversity_loss_mlp": 0.0, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07112802613336307, + "language_loss": 0.86179578, + "learning_rate": 0.00033433546493533606, + "loss": 0.87244731, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3219, + "time_per_iteration": 2.4615468978881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066941, + "balance_loss_mlp": 1.05763078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.07983484825062852, + "language_loss": 0.84651643, + "learning_rate": 0.00033404155218887897, + "loss": 0.8571859, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3220, + "time_per_iteration": 2.725001335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066491, + "balance_loss_mlp": 1.05722845, + "diversity_loss_mlp": 0.0, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.05498489673307501, + "language_loss": 0.87258649, + "learning_rate": 0.00033374770387320534, + "loss": 0.88325131, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3221, + "time_per_iteration": 2.7884719371795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066206, + "balance_loss_mlp": 1.05684233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.06826724081601121, + "language_loss": 0.85091376, + "learning_rate": 0.00033345392010239737, + "loss": 0.86157584, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3222, + "time_per_iteration": 2.758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.06346869, + "diversity_loss_mlp": 0.0, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.07112470494876487, + "language_loss": 0.82199866, + "learning_rate": 0.0003331602009905118, + "loss": 0.8327266, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3223, + "time_per_iteration": 2.7497544288635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073437, + "balance_loss_mlp": 1.06405497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06198906744782324, + "language_loss": 0.8420788, + "learning_rate": 0.00033286654665158085, + "loss": 0.85281318, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3224, + "time_per_iteration": 2.938769817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00805444, + "balance_loss_mlp": 1.36691594, + "diversity_loss_mlp": 0.21943557, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.03128305924884035, + "language_loss": 0.87915754, + "learning_rate": 0.0003325729571996109, + "loss": 0.88721198, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226849, + "step": 3225, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.07079625, + "diversity_loss_mlp": 0.0, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.15310961758991004, + "language_loss": 0.83791566, + "learning_rate": 0.000332279432748584, + "loss": 0.8487193, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3226, + "time_per_iteration": 2.723944664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078837, + "balance_loss_mlp": 1.06965768, + "diversity_loss_mlp": 0.0, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.06102841985942585, + "language_loss": 0.87609762, + "learning_rate": 0.00033198597341245576, + "loss": 0.886886, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3227, + "time_per_iteration": 2.6077282428741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107844, + "balance_loss_mlp": 1.06877792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05859377500804419, + "language_loss": 0.81977952, + "learning_rate": 0.00033169257930515763, + "loss": 0.8305639, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3228, + "time_per_iteration": 3.0201709270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079582, + "balance_loss_mlp": 1.06983042, + "diversity_loss_mlp": 0.0, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06260829937623101, + "language_loss": 0.81892502, + "learning_rate": 0.0003313992505405951, + "loss": 0.82972085, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3229, + "time_per_iteration": 2.7065281867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_mlp": 1.07612467, + "diversity_loss_mlp": 0.0, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.07524693848551285, + "language_loss": 0.81223184, + "learning_rate": 0.0003311059872326487, + "loss": 0.82308924, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3230, + "time_per_iteration": 2.6831164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082096, + "balance_loss_mlp": 1.07257652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.08041283658351392, + "language_loss": 0.792005, + "learning_rate": 0.0003308127894951734, + "loss": 0.80282593, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3231, + "time_per_iteration": 2.6133408546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07829607, + "diversity_loss_mlp": 0.0, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.0806270364015219, + "language_loss": 0.86446661, + "learning_rate": 0.00033051965744199834, + "loss": 0.87534499, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3232, + "time_per_iteration": 2.7565104961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081354, + "balance_loss_mlp": 1.07194829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.06624380464527684, + "language_loss": 0.90293765, + "learning_rate": 0.0003302265911869276, + "loss": 0.91375124, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3233, + "time_per_iteration": 2.926671266555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070794, + "balance_loss_mlp": 1.06132245, + "diversity_loss_mlp": 0.0, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.08213933441923858, + "language_loss": 0.84280741, + "learning_rate": 0.0003299335908437397, + "loss": 0.85351539, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3234, + "time_per_iteration": 2.5910556316375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074187, + "balance_loss_mlp": 1.06473994, + "diversity_loss_mlp": 0.0, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08585428313311574, + "language_loss": 0.79975766, + "learning_rate": 0.0003296406565261873, + "loss": 0.81049955, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3235, + "time_per_iteration": 2.4815149307250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.06017601, + "diversity_loss_mlp": 0.0, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.07182021420774376, + "language_loss": 0.84884858, + "learning_rate": 0.0003293477883479978, + "loss": 0.85954452, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3236, + "time_per_iteration": 2.821707248687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.05992377, + "diversity_loss_mlp": 0.0, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.08520791019751349, + "language_loss": 0.79754794, + "learning_rate": 0.0003290549864228727, + "loss": 0.80824208, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3237, + "time_per_iteration": 2.932542324066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.06604934, + "diversity_loss_mlp": 0.0, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.07053580491728426, + "language_loss": 0.86281902, + "learning_rate": 0.0003287622508644875, + "loss": 0.87357557, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3238, + "time_per_iteration": 2.742324113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00814101, + "balance_loss_mlp": 1.38574493, + "diversity_loss_mlp": 0.21743111, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.03587473659698897, + "language_loss": 0.86128193, + "learning_rate": 0.0003284695817864923, + "loss": 0.86942297, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01251296, + "step": 3239, + "time_per_iteration": 2.5240445137023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.06229532, + "diversity_loss_mlp": 0.0, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.08834225044652763, + "language_loss": 0.84207428, + "learning_rate": 0.0003281769793025116, + "loss": 0.85279179, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3240, + "time_per_iteration": 2.733356237411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00812174, + "balance_loss_mlp": 1.3801111, + "diversity_loss_mlp": 0.21927354, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.03793852776762896, + "language_loss": 0.8948651, + "learning_rate": 0.00032788444352614346, + "loss": 0.90298682, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01248194, + "step": 3241, + "time_per_iteration": 2.599942922592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_mlp": 1.06840372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.07096292336409799, + "language_loss": 0.80582923, + "learning_rate": 0.0003275919745709606, + "loss": 0.81660759, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3242, + "time_per_iteration": 2.5855822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079567, + "balance_loss_mlp": 1.07014906, + "diversity_loss_mlp": 0.0, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.06686828549294242, + "language_loss": 0.81972641, + "learning_rate": 0.00032729957255050936, + "loss": 0.83052206, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3243, + "time_per_iteration": 2.652064561843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.06973052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.0716805986451115, + "language_loss": 0.81674051, + "learning_rate": 0.0003270072375783102, + "loss": 0.8275336, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3244, + "time_per_iteration": 2.894718647003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070218, + "balance_loss_mlp": 1.06071746, + "diversity_loss_mlp": 0.0, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06745739273028781, + "language_loss": 0.79402959, + "learning_rate": 0.00032671496976785774, + "loss": 0.80473179, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3245, + "time_per_iteration": 2.637991428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.06772995, + "diversity_loss_mlp": 0.0, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06297519573167677, + "language_loss": 0.7578575, + "learning_rate": 0.0003264227692326205, + "loss": 0.76862872, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3246, + "time_per_iteration": 3.0627310276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010763, + "balance_loss_mlp": 1.06653643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06711643928809063, + "language_loss": 0.85974544, + "learning_rate": 0.00032613063608604055, + "loss": 0.87050849, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3247, + "time_per_iteration": 2.6602516174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.0650897, + "diversity_loss_mlp": 0.0, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06836828090896512, + "language_loss": 0.8368777, + "learning_rate": 0.0003258385704415343, + "loss": 0.84762454, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3248, + "time_per_iteration": 2.5850605964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068989, + "balance_loss_mlp": 1.05929732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.0567839390219681, + "language_loss": 0.82901073, + "learning_rate": 0.0003255465724124915, + "loss": 0.83970058, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3249, + "time_per_iteration": 2.7133941650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068793, + "balance_loss_mlp": 1.05952442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05839887652934639, + "language_loss": 0.82966471, + "learning_rate": 0.00032525464211227587, + "loss": 0.84035265, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3250, + "time_per_iteration": 2.611469030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071121, + "balance_loss_mlp": 1.06180525, + "diversity_loss_mlp": 0.0, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.07351416510504778, + "language_loss": 0.85770059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8684119, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3251, + "time_per_iteration": 2.6665618419647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066555, + "balance_loss_mlp": 1.05709553, + "diversity_loss_mlp": 0.0, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06415360650327814, + "language_loss": 0.84284747, + "learning_rate": 0.00032467098515164943, + "loss": 0.853513, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3252, + "time_per_iteration": 2.8863329887390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069653, + "balance_loss_mlp": 1.06005657, + "diversity_loss_mlp": 0.0, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07319159145136593, + "language_loss": 0.83726692, + "learning_rate": 0.00032437925871783456, + "loss": 0.84796345, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3253, + "time_per_iteration": 2.6411869525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107529, + "balance_loss_mlp": 1.06570566, + "diversity_loss_mlp": 0.0, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06969705547120199, + "language_loss": 0.84202456, + "learning_rate": 0.00032408760046603803, + "loss": 0.85277742, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3254, + "time_per_iteration": 2.79947829246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070892, + "balance_loss_mlp": 1.06131983, + "diversity_loss_mlp": 0.0, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06622216529123302, + "language_loss": 0.77594912, + "learning_rate": 0.00032379601050949193, + "loss": 0.78665805, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3255, + "time_per_iteration": 3.089614152908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073078, + "balance_loss_mlp": 1.06385732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.06913459813204618, + "language_loss": 0.88098216, + "learning_rate": 0.0003235044889614013, + "loss": 0.8917129, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3256, + "time_per_iteration": 2.5961923599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076462, + "balance_loss_mlp": 1.0670923, + "diversity_loss_mlp": 0.0, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.07985483332339025, + "language_loss": 0.83828497, + "learning_rate": 0.0003232130359349451, + "loss": 0.84904957, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3257, + "time_per_iteration": 2.8164010047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106986, + "balance_loss_mlp": 1.06043053, + "diversity_loss_mlp": 0.0, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06128522405733426, + "language_loss": 0.81820428, + "learning_rate": 0.0003229216515432751, + "loss": 0.82890296, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3258, + "time_per_iteration": 2.7743678092956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00804618, + "balance_loss_mlp": 1.36253858, + "diversity_loss_mlp": 0.22081783, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.03450370763198899, + "language_loss": 0.80067343, + "learning_rate": 0.0003226303358995174, + "loss": 0.80871964, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293936, + "step": 3259, + "time_per_iteration": 2.6309425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065495, + "balance_loss_mlp": 1.05593443, + "diversity_loss_mlp": 0.0, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.05636981182900784, + "language_loss": 0.88916153, + "learning_rate": 0.00032233908911677, + "loss": 0.89981651, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3260, + "time_per_iteration": 2.847928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.06297052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.07940970349438319, + "language_loss": 0.810615, + "learning_rate": 0.0003220479113081053, + "loss": 0.8213383, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3261, + "time_per_iteration": 2.7070260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.06123137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.06801817573689214, + "language_loss": 0.78964686, + "learning_rate": 0.00032175680258656836, + "loss": 0.80034894, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3262, + "time_per_iteration": 2.7481493949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067516, + "balance_loss_mlp": 1.05819941, + "diversity_loss_mlp": 0.0, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06408124041259919, + "language_loss": 0.80091017, + "learning_rate": 0.00032146576306517794, + "loss": 0.81158531, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3263, + "time_per_iteration": 2.799330949783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071623, + "balance_loss_mlp": 1.06242585, + "diversity_loss_mlp": 0.0, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.06510106509747231, + "language_loss": 0.80605328, + "learning_rate": 0.0003211747928569255, + "loss": 0.81676954, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3264, + "time_per_iteration": 2.71992826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.06197381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.06441574996580214, + "language_loss": 0.8154881, + "learning_rate": 0.0003208838920747754, + "loss": 0.82620275, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3265, + "time_per_iteration": 2.8526246547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073261, + "balance_loss_mlp": 1.06409347, + "diversity_loss_mlp": 0.0, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.07893812182761015, + "language_loss": 0.76554495, + "learning_rate": 0.0003205930608316656, + "loss": 0.7762776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3266, + "time_per_iteration": 3.4734575748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066498, + "balance_loss_mlp": 1.05708683, + "diversity_loss_mlp": 0.0, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06620674427686414, + "language_loss": 0.85159075, + "learning_rate": 0.00032030229924050673, + "loss": 0.86225569, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3267, + "time_per_iteration": 2.7024662494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.06285858, + "diversity_loss_mlp": 0.0, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.06417389888600762, + "language_loss": 0.79950488, + "learning_rate": 0.00032001160741418247, + "loss": 0.81022519, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3268, + "time_per_iteration": 2.6112074851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_mlp": 1.05720639, + "diversity_loss_mlp": 0.0, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.08748068388552233, + "language_loss": 0.82228744, + "learning_rate": 0.0003197209854655494, + "loss": 0.83295155, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3269, + "time_per_iteration": 2.642714500427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064334, + "balance_loss_mlp": 1.05507767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07987454353472763, + "language_loss": 0.74589109, + "learning_rate": 0.0003194304335074371, + "loss": 0.7565344, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3270, + "time_per_iteration": 2.8935019969940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061724, + "balance_loss_mlp": 1.05230033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07476368913364388, + "language_loss": 0.8843264, + "learning_rate": 0.0003191399516526475, + "loss": 0.89494365, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3271, + "time_per_iteration": 2.5182955265045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.0580647, + "diversity_loss_mlp": 0.0, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.0671044499872579, + "language_loss": 0.79825693, + "learning_rate": 0.0003188495400139559, + "loss": 0.80893195, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3272, + "time_per_iteration": 2.834392786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106563, + "balance_loss_mlp": 1.05608094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07440991142052084, + "language_loss": 0.84596652, + "learning_rate": 0.00031855919870411013, + "loss": 0.85662282, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3273, + "time_per_iteration": 2.8662502765655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.05781233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06934000715416044, + "language_loss": 0.8508203, + "learning_rate": 0.0003182689278358305, + "loss": 0.86149418, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3274, + "time_per_iteration": 2.707679510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071338, + "balance_loss_mlp": 1.06173623, + "diversity_loss_mlp": 0.0, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.08830765837123684, + "language_loss": 0.79631943, + "learning_rate": 0.0003179787275218105, + "loss": 0.80703276, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3275, + "time_per_iteration": 2.6076841354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806629, + "balance_loss_mlp": 1.3660543, + "diversity_loss_mlp": 0.22307114, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.030809011685951734, + "language_loss": 0.84306061, + "learning_rate": 0.0003176885978747155, + "loss": 0.85112691, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01206683, + "step": 3276, + "time_per_iteration": 2.6712234020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070055, + "balance_loss_mlp": 1.06039953, + "diversity_loss_mlp": 0.0, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.05912857494905308, + "language_loss": 0.82393259, + "learning_rate": 0.0003173985390071839, + "loss": 0.83463317, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3277, + "time_per_iteration": 2.8781204223632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020459, + "balance_loss_mlp": 1.01545238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.014813696367821054, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78920913, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.05004883, + "routerloss_mlp": 0.0, + "step": 3278, + "time_per_iteration": 4.869734287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.06190431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.07813339799532502, + "language_loss": 0.80876654, + "learning_rate": 0.00031681863406122704, + "loss": 0.8194856, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3279, + "time_per_iteration": 2.773547410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074089, + "balance_loss_mlp": 1.06446278, + "diversity_loss_mlp": 0.0, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07216916580711319, + "language_loss": 0.85329819, + "learning_rate": 0.00031652878820794087, + "loss": 0.86403906, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3280, + "time_per_iteration": 2.980884552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070557, + "balance_loss_mlp": 1.0605855, + "diversity_loss_mlp": 0.0, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08329353384521647, + "language_loss": 0.85882401, + "learning_rate": 0.00031623901358449627, + "loss": 0.8695296, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3281, + "time_per_iteration": 2.650691509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107097, + "balance_loss_mlp": 1.06155276, + "diversity_loss_mlp": 0.0, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.06939094759952598, + "language_loss": 0.88689077, + "learning_rate": 0.0003159493103033936, + "loss": 0.89760047, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3282, + "time_per_iteration": 2.589892864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022479, + "balance_loss_mlp": 1.0175674, + "diversity_loss_mlp": 0.0, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.015595592818812096, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941534, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3283, + "time_per_iteration": 4.845726728439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063188, + "balance_loss_mlp": 1.05360401, + "diversity_loss_mlp": 0.0, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.08266858178450832, + "language_loss": 0.82553136, + "learning_rate": 0.0003153701182180776, + "loss": 0.83616328, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3284, + "time_per_iteration": 2.783351421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.05632019, + "diversity_loss_mlp": 0.0, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.063758085961612, + "language_loss": 0.81699741, + "learning_rate": 0.00031508062963872655, + "loss": 0.82765627, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3285, + "time_per_iteration": 2.5591769218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064029, + "balance_loss_mlp": 1.05435503, + "diversity_loss_mlp": 0.0, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06946286940388995, + "language_loss": 0.79716074, + "learning_rate": 0.0003147912128514423, + "loss": 0.80780101, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3286, + "time_per_iteration": 2.7374072074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792206, + "balance_loss_mlp": 1.3388809, + "diversity_loss_mlp": 0.2218435, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.030646294163886513, + "language_loss": 0.87300044, + "learning_rate": 0.0003145018679685859, + "loss": 0.8809225, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01184397, + "step": 3287, + "time_per_iteration": 2.7549750804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067783, + "balance_loss_mlp": 1.05837727, + "diversity_loss_mlp": 0.0, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05105189166461937, + "language_loss": 0.87830782, + "learning_rate": 0.00031421259510249134, + "loss": 0.88898563, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3288, + "time_per_iteration": 2.7835381031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.05796242, + "diversity_loss_mlp": 0.0, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.136960350782239, + "language_loss": 0.81129575, + "learning_rate": 0.00031392339436546414, + "loss": 0.82197285, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3289, + "time_per_iteration": 2.8133864402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.05946374, + "diversity_loss_mlp": 0.0, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0683406709240254, + "language_loss": 0.8385359, + "learning_rate": 0.00031363426586978205, + "loss": 0.84923339, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3290, + "time_per_iteration": 2.7862977981567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070784, + "balance_loss_mlp": 1.06093121, + "diversity_loss_mlp": 0.0, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.06517080869241837, + "language_loss": 0.84541273, + "learning_rate": 0.0003133452097276947, + "loss": 0.85612059, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3291, + "time_per_iteration": 2.735102415084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063814, + "balance_loss_mlp": 1.05341327, + "diversity_loss_mlp": 0.0, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.06655999718782692, + "language_loss": 0.8441304, + "learning_rate": 0.0003130562260514238, + "loss": 0.85476851, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 3292, + "time_per_iteration": 2.7411108016967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.05757022, + "diversity_loss_mlp": 0.0, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.05657366074496326, + "language_loss": 0.81691957, + "learning_rate": 0.0003127673149531626, + "loss": 0.82759798, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3293, + "time_per_iteration": 2.766249418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.05568373, + "diversity_loss_mlp": 0.0, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.0752121645824798, + "language_loss": 0.83436191, + "learning_rate": 0.0003124784765450762, + "loss": 0.84502298, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3294, + "time_per_iteration": 2.5490550994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066062, + "balance_loss_mlp": 1.05569124, + "diversity_loss_mlp": 0.0, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.06917813795445459, + "language_loss": 0.797925, + "learning_rate": 0.0003121897109393017, + "loss": 0.80858564, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3295, + "time_per_iteration": 2.779365062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061765, + "balance_loss_mlp": 1.05135238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06234951999103671, + "language_loss": 0.89289808, + "learning_rate": 0.0003119010182479481, + "loss": 0.9035157, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3296, + "time_per_iteration": 2.6138393878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05855989, + "diversity_loss_mlp": 0.0, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06350246507064496, + "language_loss": 0.82675922, + "learning_rate": 0.00031161239858309563, + "loss": 0.83745015, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 3297, + "time_per_iteration": 2.586970329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072163, + "balance_loss_mlp": 1.06148767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0696399427467901, + "language_loss": 0.83455825, + "learning_rate": 0.0003113238520567964, + "loss": 0.84527981, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 3298, + "time_per_iteration": 2.6586110591888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.05495286, + "diversity_loss_mlp": 0.0, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.07177816314390054, + "language_loss": 0.81584775, + "learning_rate": 0.00031103537878107403, + "loss": 0.82650054, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3299, + "time_per_iteration": 2.708526372909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106763, + "balance_loss_mlp": 1.05756879, + "diversity_loss_mlp": 0.0, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.0821312661024272, + "language_loss": 0.7999661, + "learning_rate": 0.0003107469788679238, + "loss": 0.81064236, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3300, + "time_per_iteration": 2.774571180343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070552, + "balance_loss_mlp": 1.06004977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06269586290013059, + "language_loss": 0.86672354, + "learning_rate": 0.00031045865242931267, + "loss": 0.87742901, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3301, + "time_per_iteration": 2.800271987915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_mlp": 1.06537664, + "diversity_loss_mlp": 0.0, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.060025608417058285, + "language_loss": 0.83086729, + "learning_rate": 0.00031017039957717877, + "loss": 0.84162271, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3302, + "time_per_iteration": 2.99652361869812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083489, + "balance_loss_mlp": 1.07342744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.0673613891994724, + "language_loss": 0.89035141, + "learning_rate": 0.0003098822204234318, + "loss": 0.90118629, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3303, + "time_per_iteration": 2.6769609451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.06632543, + "diversity_loss_mlp": 0.0, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.0682411238472533, + "language_loss": 0.87294948, + "learning_rate": 0.00030959411507995273, + "loss": 0.88371098, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3304, + "time_per_iteration": 3.25303053855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073289, + "balance_loss_mlp": 1.06334674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09293144525754729, + "language_loss": 0.80997777, + "learning_rate": 0.00030930608365859407, + "loss": 0.82071066, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3305, + "time_per_iteration": 2.650047540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.06908488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.06298630616486185, + "language_loss": 0.87762672, + "learning_rate": 0.00030901812627117943, + "loss": 0.8884176, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3306, + "time_per_iteration": 2.605576276779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106604, + "balance_loss_mlp": 1.05617523, + "diversity_loss_mlp": 0.0, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.09439685712352788, + "language_loss": 0.8446157, + "learning_rate": 0.000308730243029504, + "loss": 0.85527611, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3307, + "time_per_iteration": 2.6111857891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070403, + "balance_loss_mlp": 1.06070554, + "diversity_loss_mlp": 0.0, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.06852736886674453, + "language_loss": 0.7914747, + "learning_rate": 0.0003084424340453339, + "loss": 0.80217868, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3308, + "time_per_iteration": 2.8072149753570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063715, + "balance_loss_mlp": 1.05379033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.0739185528440478, + "language_loss": 0.82162523, + "learning_rate": 0.0003081546994304064, + "loss": 0.8322624, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3309, + "time_per_iteration": 2.7670769691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_mlp": 1.04971123, + "diversity_loss_mlp": 0.0, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07802596117693822, + "language_loss": 0.81907165, + "learning_rate": 0.0003078670392964298, + "loss": 0.82966554, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3310, + "time_per_iteration": 2.6474099159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058905, + "balance_loss_mlp": 1.04899311, + "diversity_loss_mlp": 0.0, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.0731557233203608, + "language_loss": 0.82997435, + "learning_rate": 0.00030757945375508406, + "loss": 0.84056342, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3311, + "time_per_iteration": 2.6429851055145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054164, + "balance_loss_mlp": 1.04434729, + "diversity_loss_mlp": 0.0, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06845871409018763, + "language_loss": 0.81414253, + "learning_rate": 0.00030729194291801944, + "loss": 0.8246842, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3312, + "time_per_iteration": 2.6631555557250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.04690671, + "diversity_loss_mlp": 0.0, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.08097298950364754, + "language_loss": 0.77058214, + "learning_rate": 0.00030700450689685787, + "loss": 0.78114825, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3313, + "time_per_iteration": 2.540600061416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.0500232, + "diversity_loss_mlp": 0.0, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.0804877394257798, + "language_loss": 0.85728467, + "learning_rate": 0.00030671714580319186, + "loss": 0.86788076, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3314, + "time_per_iteration": 2.804875135421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055412, + "balance_loss_mlp": 1.04565513, + "diversity_loss_mlp": 0.0, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07597136338877614, + "language_loss": 0.83442312, + "learning_rate": 0.0003064298597485846, + "loss": 0.84497726, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3315, + "time_per_iteration": 2.860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010587, + "balance_loss_mlp": 1.04858494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.06770078099501715, + "language_loss": 0.83771706, + "learning_rate": 0.00030614264884457054, + "loss": 0.84830409, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3316, + "time_per_iteration": 2.6398963928222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_mlp": 1.04450154, + "diversity_loss_mlp": 0.0, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09575765703427323, + "language_loss": 0.77156532, + "learning_rate": 0.000305855513202655, + "loss": 0.78211164, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3317, + "time_per_iteration": 2.57024884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.04220688, + "diversity_loss_mlp": 0.0, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.07693758647747995, + "language_loss": 0.77392501, + "learning_rate": 0.0003055684529343138, + "loss": 0.7844497, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 3318, + "time_per_iteration": 2.4296517372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.04889059, + "diversity_loss_mlp": 0.0, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.08157026730411542, + "language_loss": 0.78901523, + "learning_rate": 0.00030528146815099374, + "loss": 0.79960155, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3319, + "time_per_iteration": 2.6178040504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105942, + "balance_loss_mlp": 1.0495379, + "diversity_loss_mlp": 0.0, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.05929975411068792, + "language_loss": 0.72059178, + "learning_rate": 0.00030499455896411203, + "loss": 0.73118603, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3320, + "time_per_iteration": 2.627962589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_mlp": 1.02049422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.01967957525447477, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77326888, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.0559082, + "routerloss_mlp": 0.0, + "step": 3321, + "time_per_iteration": 4.926000595092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068116, + "balance_loss_mlp": 1.05800068, + "diversity_loss_mlp": 0.0, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06833251339694629, + "language_loss": 0.76524007, + "learning_rate": 0.0003044209678251865, + "loss": 0.77592129, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 3322, + "time_per_iteration": 2.916396379470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.05691469, + "diversity_loss_mlp": 0.0, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.05729140281605497, + "language_loss": 0.84366953, + "learning_rate": 0.0003041342860958306, + "loss": 0.85433549, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3323, + "time_per_iteration": 2.7770862579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.06162453, + "diversity_loss_mlp": 0.0, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.08519156923386062, + "language_loss": 0.91346496, + "learning_rate": 0.00030384768040828857, + "loss": 0.92417842, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3324, + "time_per_iteration": 2.6812171936035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081336, + "balance_loss_mlp": 1.07172787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.07651235317530308, + "language_loss": 0.85160887, + "learning_rate": 0.00030356115087383094, + "loss": 0.86242223, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3325, + "time_per_iteration": 2.6458263397216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00811228, + "balance_loss_mlp": 1.37989581, + "diversity_loss_mlp": 0.21910624, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.034032588306098184, + "language_loss": 0.8530367, + "learning_rate": 0.00030327469760369803, + "loss": 0.86114895, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01172681, + "step": 3326, + "time_per_iteration": 2.6054904460906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075022, + "balance_loss_mlp": 1.06528878, + "diversity_loss_mlp": 0.0, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06651858881657381, + "language_loss": 0.84802389, + "learning_rate": 0.0003029883207091009, + "loss": 0.85877407, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3327, + "time_per_iteration": 2.7084085941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075174, + "balance_loss_mlp": 1.06530905, + "diversity_loss_mlp": 0.0, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.07064025062286232, + "language_loss": 0.78362405, + "learning_rate": 0.00030270202030122095, + "loss": 0.79437578, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3328, + "time_per_iteration": 2.668501615524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_mlp": 1.06659508, + "diversity_loss_mlp": 0.0, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.07541554155703202, + "language_loss": 0.85661519, + "learning_rate": 0.00030241579649121, + "loss": 0.867378, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3329, + "time_per_iteration": 2.9972317218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107413, + "balance_loss_mlp": 1.06488538, + "diversity_loss_mlp": 0.0, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.06439571325368963, + "language_loss": 0.7957617, + "learning_rate": 0.00030212964939018994, + "loss": 0.806503, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3330, + "time_per_iteration": 2.5598840713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075671, + "balance_loss_mlp": 1.06651545, + "diversity_loss_mlp": 0.0, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.07958558119065547, + "language_loss": 0.85401917, + "learning_rate": 0.0003018435791092527, + "loss": 0.8647759, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3331, + "time_per_iteration": 2.4886720180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077198, + "balance_loss_mlp": 1.06757176, + "diversity_loss_mlp": 0.0, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08502928683846613, + "language_loss": 0.80926251, + "learning_rate": 0.00030155758575946083, + "loss": 0.8200345, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3332, + "time_per_iteration": 2.661039113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073982, + "balance_loss_mlp": 1.06464815, + "diversity_loss_mlp": 0.0, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.07641451366860309, + "language_loss": 0.84045428, + "learning_rate": 0.0003012716694518467, + "loss": 0.85119408, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3333, + "time_per_iteration": 2.579451322555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.06456161, + "diversity_loss_mlp": 0.0, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.06148329614598223, + "language_loss": 0.85011578, + "learning_rate": 0.000300985830297413, + "loss": 0.86085725, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3334, + "time_per_iteration": 2.6951658725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070237, + "balance_loss_mlp": 1.0607183, + "diversity_loss_mlp": 0.0, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.07715385519242493, + "language_loss": 0.8765533, + "learning_rate": 0.00030070006840713205, + "loss": 0.88725567, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3335, + "time_per_iteration": 3.415095329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_mlp": 1.05956614, + "diversity_loss_mlp": 0.0, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.06540243812784874, + "language_loss": 0.73462147, + "learning_rate": 0.000300414383891947, + "loss": 0.74531144, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3336, + "time_per_iteration": 2.8207781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06142569, + "diversity_loss_mlp": 0.0, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.062126831222401244, + "language_loss": 0.88856506, + "learning_rate": 0.00030012877686276973, + "loss": 0.89927197, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3337, + "time_per_iteration": 2.701467752456665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.06103206, + "diversity_loss_mlp": 0.0, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.06622404014204096, + "language_loss": 0.86998606, + "learning_rate": 0.0002998432474304832, + "loss": 0.88069206, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3338, + "time_per_iteration": 2.754462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023208, + "balance_loss_mlp": 1.01724732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.025409804512754288, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80260551, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3339, + "time_per_iteration": 4.871408700942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061344, + "balance_loss_mlp": 1.05190849, + "diversity_loss_mlp": 0.0, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.056182904751461135, + "language_loss": 0.88884711, + "learning_rate": 0.00029927242179996107, + "loss": 0.89946061, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3340, + "time_per_iteration": 2.6943204402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063875, + "balance_loss_mlp": 1.05451107, + "diversity_loss_mlp": 0.0, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05740093819519034, + "language_loss": 0.83547878, + "learning_rate": 0.0002989871258233398, + "loss": 0.8461175, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3341, + "time_per_iteration": 2.759075164794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106288, + "balance_loss_mlp": 1.05317652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.08495529058707293, + "language_loss": 0.82866132, + "learning_rate": 0.0002987019078868373, + "loss": 0.83929014, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3342, + "time_per_iteration": 2.460184097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806137, + "balance_loss_mlp": 1.3687458, + "diversity_loss_mlp": 0.21894245, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.03059825895364693, + "language_loss": 0.81932986, + "learning_rate": 0.00029841676810118484, + "loss": 0.82739115, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01229309, + "step": 3343, + "time_per_iteration": 2.6885409355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_mlp": 1.04915428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.0604476685897385, + "language_loss": 0.87177467, + "learning_rate": 0.0002981317065770839, + "loss": 0.88236231, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3344, + "time_per_iteration": 3.03983736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060176, + "balance_loss_mlp": 1.05044222, + "diversity_loss_mlp": 0.0, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.07704872008291591, + "language_loss": 0.8078779, + "learning_rate": 0.00029784672342520493, + "loss": 0.81847966, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3345, + "time_per_iteration": 2.6846296787261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_mlp": 1.05220366, + "diversity_loss_mlp": 0.0, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06975007259690363, + "language_loss": 0.8341136, + "learning_rate": 0.00029756181875618834, + "loss": 0.84473026, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3346, + "time_per_iteration": 2.5665693283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808422, + "balance_loss_mlp": 1.37269104, + "diversity_loss_mlp": 0.21939373, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.035494504018204545, + "language_loss": 0.83294541, + "learning_rate": 0.0002972769926806439, + "loss": 0.84102958, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123796, + "step": 3347, + "time_per_iteration": 2.504934549331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0080263, + "balance_loss_mlp": 1.36098909, + "diversity_loss_mlp": 0.21952364, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0334865497392214, + "language_loss": 0.88848293, + "learning_rate": 0.0002969922453091508, + "loss": 0.89650929, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237371, + "step": 3348, + "time_per_iteration": 2.588092803955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105701, + "balance_loss_mlp": 1.04741955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.07081599083542611, + "language_loss": 0.85229504, + "learning_rate": 0.00029670757675225777, + "loss": 0.86286509, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3349, + "time_per_iteration": 2.7467896938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_mlp": 1.04726744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.08621507866757971, + "language_loss": 0.79660463, + "learning_rate": 0.0002964229871204831, + "loss": 0.80717242, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3350, + "time_per_iteration": 2.65602707862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056473, + "balance_loss_mlp": 1.04715693, + "diversity_loss_mlp": 0.0, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.0705050991392221, + "language_loss": 0.83769023, + "learning_rate": 0.00029613847652431403, + "loss": 0.84825498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3351, + "time_per_iteration": 2.8451104164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797485, + "balance_loss_mlp": 1.35163832, + "diversity_loss_mlp": 0.21852379, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.02943697991412704, + "language_loss": 0.79510611, + "learning_rate": 0.0002958540450742078, + "loss": 0.80308104, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01240353, + "step": 3352, + "time_per_iteration": 2.950679063796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060228, + "balance_loss_mlp": 1.05063784, + "diversity_loss_mlp": 0.0, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.06852868488451136, + "language_loss": 0.7732749, + "learning_rate": 0.0002955696928805901, + "loss": 0.78387713, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3353, + "time_per_iteration": 2.8771724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067774, + "balance_loss_mlp": 1.0582372, + "diversity_loss_mlp": 0.0, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.10704512558750189, + "language_loss": 0.86111909, + "learning_rate": 0.0002952854200538563, + "loss": 0.87179685, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3354, + "time_per_iteration": 2.777782917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798015, + "balance_loss_mlp": 1.35377836, + "diversity_loss_mlp": 0.21820019, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.032699702246912744, + "language_loss": 0.82167614, + "learning_rate": 0.000295001226704371, + "loss": 0.82965624, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01202584, + "step": 3355, + "time_per_iteration": 2.5991604328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_mlp": 1.05207551, + "diversity_loss_mlp": 0.0, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07645377110954561, + "language_loss": 0.82891458, + "learning_rate": 0.00029471711294246783, + "loss": 0.8395294, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3356, + "time_per_iteration": 2.8146939277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.05512571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.07650305014050414, + "language_loss": 0.82254899, + "learning_rate": 0.0002944330788784494, + "loss": 0.83319402, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3357, + "time_per_iteration": 2.90537428855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05508041, + "diversity_loss_mlp": 0.0, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.06168723315149378, + "language_loss": 0.84662282, + "learning_rate": 0.00029414912462258786, + "loss": 0.85727078, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3358, + "time_per_iteration": 2.8301830291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068597, + "balance_loss_mlp": 1.05873299, + "diversity_loss_mlp": 0.0, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.07109215771884392, + "language_loss": 0.81651056, + "learning_rate": 0.00029386525028512366, + "loss": 0.8271966, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3359, + "time_per_iteration": 2.689298152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068602, + "balance_loss_mlp": 1.05881464, + "diversity_loss_mlp": 0.0, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.0690455154627963, + "language_loss": 0.86761546, + "learning_rate": 0.0002935814559762666, + "loss": 0.8783015, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3360, + "time_per_iteration": 2.820415496826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.06286263, + "diversity_loss_mlp": 0.0, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.06340694058104589, + "language_loss": 0.7940557, + "learning_rate": 0.0002932977418061957, + "loss": 0.80478007, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3361, + "time_per_iteration": 2.638246536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075754, + "balance_loss_mlp": 1.06592488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.11078731162526398, + "language_loss": 0.80980253, + "learning_rate": 0.00029301410788505833, + "loss": 0.82056004, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3362, + "time_per_iteration": 2.829946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067795, + "balance_loss_mlp": 1.05792451, + "diversity_loss_mlp": 0.0, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.08350394703111745, + "language_loss": 0.80845594, + "learning_rate": 0.00029273055432297126, + "loss": 0.81913394, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3363, + "time_per_iteration": 2.5047130584716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057084, + "balance_loss_mlp": 1.04717803, + "diversity_loss_mlp": 0.0, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06756647759690963, + "language_loss": 0.80998582, + "learning_rate": 0.00029244708123001917, + "loss": 0.8205567, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3364, + "time_per_iteration": 3.071207284927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059298, + "balance_loss_mlp": 1.04951715, + "diversity_loss_mlp": 0.0, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.08982319043529345, + "language_loss": 0.84555328, + "learning_rate": 0.0002921636887162565, + "loss": 0.85614622, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3365, + "time_per_iteration": 2.768284797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057421, + "balance_loss_mlp": 1.04800391, + "diversity_loss_mlp": 0.0, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.08629567448100454, + "language_loss": 0.83712798, + "learning_rate": 0.00029188037689170595, + "loss": 0.84770226, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3366, + "time_per_iteration": 2.9462075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04440713, + "diversity_loss_mlp": 0.0, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.07194825267456643, + "language_loss": 0.84329098, + "learning_rate": 0.0002915971458663586, + "loss": 0.85383338, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.09820557, + "routerloss_mlp": 0.0, + "step": 3367, + "time_per_iteration": 3.052452802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105506, + "balance_loss_mlp": 1.04521894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.06187590041276245, + "language_loss": 0.81901962, + "learning_rate": 0.00029131399575017494, + "loss": 0.82957023, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3368, + "time_per_iteration": 3.260995864868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054713, + "balance_loss_mlp": 1.04508734, + "diversity_loss_mlp": 0.0, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.08987768190651603, + "language_loss": 0.85898274, + "learning_rate": 0.0002910309266530836, + "loss": 0.8695299, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3369, + "time_per_iteration": 2.8022115230560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059559, + "balance_loss_mlp": 1.0497539, + "diversity_loss_mlp": 0.0, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.07644364345836648, + "language_loss": 0.8560974, + "learning_rate": 0.0002907479386849814, + "loss": 0.86669296, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.09796143, + "routerloss_mlp": 0.0, + "step": 3370, + "time_per_iteration": 2.646334171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057441, + "balance_loss_mlp": 1.04791021, + "diversity_loss_mlp": 0.0, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.07833648604751785, + "language_loss": 0.80597669, + "learning_rate": 0.0002904650319557339, + "loss": 0.81655109, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3371, + "time_per_iteration": 2.9977073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787303, + "balance_loss_mlp": 1.33170056, + "diversity_loss_mlp": 0.21746175, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.036264020076934224, + "language_loss": 0.81342006, + "learning_rate": 0.0002901822065751758, + "loss": 0.82129312, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01272238, + "step": 3372, + "time_per_iteration": 2.697375774383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054945, + "balance_loss_mlp": 1.04537833, + "diversity_loss_mlp": 0.0, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.06787352107623057, + "language_loss": 0.8556366, + "learning_rate": 0.0002898994626531093, + "loss": 0.86618596, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3373, + "time_per_iteration": 2.8561713695526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_mlp": 1.05008769, + "diversity_loss_mlp": 0.0, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.07079984620053167, + "language_loss": 0.87879932, + "learning_rate": 0.00028961680029930526, + "loss": 0.88939387, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3374, + "time_per_iteration": 2.535357713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_mlp": 1.04902411, + "diversity_loss_mlp": 0.0, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.07847742657670442, + "language_loss": 0.7705428, + "learning_rate": 0.00028933421962350317, + "loss": 0.78112632, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3375, + "time_per_iteration": 2.7630350589752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059653, + "balance_loss_mlp": 1.05022955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.060066877370730534, + "language_loss": 0.83867884, + "learning_rate": 0.0002890517207354104, + "loss": 0.84927535, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3376, + "time_per_iteration": 2.8403854370117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067954, + "balance_loss_mlp": 1.05819058, + "diversity_loss_mlp": 0.0, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.07875615832785021, + "language_loss": 0.81685328, + "learning_rate": 0.0002887693037447029, + "loss": 0.82753289, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3377, + "time_per_iteration": 2.5936834812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786778, + "balance_loss_mlp": 1.32879448, + "diversity_loss_mlp": 0.22056285, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.03360133181749734, + "language_loss": 0.82620949, + "learning_rate": 0.00028848696876102443, + "loss": 0.8340773, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01209909, + "step": 3378, + "time_per_iteration": 2.646881341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083646, + "balance_loss_mlp": 1.07432425, + "diversity_loss_mlp": 0.0, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.07289026043627096, + "language_loss": 0.83464664, + "learning_rate": 0.00028820471589398723, + "loss": 0.84548312, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3379, + "time_per_iteration": 2.5300872325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079061, + "balance_loss_mlp": 1.3374207, + "diversity_loss_mlp": 0.22020277, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.03832598047329158, + "language_loss": 0.78047603, + "learning_rate": 0.00028792254525317196, + "loss": 0.78838205, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179803, + "step": 3380, + "time_per_iteration": 2.696711301803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_mlp": 1.08066666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07654044550208572, + "language_loss": 0.81385279, + "learning_rate": 0.00028764045694812645, + "loss": 0.82475317, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3381, + "time_per_iteration": 2.7730586528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.08303761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.08987457099582341, + "language_loss": 0.76744068, + "learning_rate": 0.0002873584510883671, + "loss": 0.77836508, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3382, + "time_per_iteration": 2.6443450450897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088701, + "balance_loss_mlp": 1.07926512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.07067062397279458, + "language_loss": 0.86143303, + "learning_rate": 0.0002870765277833788, + "loss": 0.87232006, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3383, + "time_per_iteration": 2.740920305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.07161593, + "diversity_loss_mlp": 0.0, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.07689735458190097, + "language_loss": 0.80460048, + "learning_rate": 0.00028679468714261347, + "loss": 0.81540942, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3384, + "time_per_iteration": 2.7767040729522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.06546891, + "diversity_loss_mlp": 0.0, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06416640561224615, + "language_loss": 0.76925558, + "learning_rate": 0.0002865129292754918, + "loss": 0.78000295, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3385, + "time_per_iteration": 2.591616630554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075196, + "balance_loss_mlp": 1.06574309, + "diversity_loss_mlp": 0.0, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.06819374320087251, + "language_loss": 0.81950033, + "learning_rate": 0.00028623125429140105, + "loss": 0.83025235, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3386, + "time_per_iteration": 2.819565773010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.05845094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.07152430707450508, + "language_loss": 0.8685019, + "learning_rate": 0.00028594966229969785, + "loss": 0.87918359, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3387, + "time_per_iteration": 2.6802561283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_mlp": 1.05746567, + "diversity_loss_mlp": 0.0, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.0719578704836234, + "language_loss": 0.81695348, + "learning_rate": 0.00028566815340970577, + "loss": 0.82762671, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3388, + "time_per_iteration": 2.725184917449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078869, + "balance_loss_mlp": 1.33117235, + "diversity_loss_mlp": 0.22285563, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.03133119374313574, + "language_loss": 0.80959165, + "learning_rate": 0.0002853867277307162, + "loss": 0.81747854, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01167633, + "step": 3389, + "time_per_iteration": 2.6700825691223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.05601168, + "diversity_loss_mlp": 0.0, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.077177119922592, + "language_loss": 0.82811326, + "learning_rate": 0.00028510538537198824, + "loss": 0.83877325, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 3390, + "time_per_iteration": 2.65598464012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065546, + "balance_loss_mlp": 1.05591428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.06292665593790116, + "language_loss": 0.86663938, + "learning_rate": 0.00028482412644274867, + "loss": 0.87729478, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3391, + "time_per_iteration": 2.926029682159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106361, + "balance_loss_mlp": 1.05354261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.07441000419261597, + "language_loss": 0.74793214, + "learning_rate": 0.00028454295105219207, + "loss": 0.75856817, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3392, + "time_per_iteration": 2.6511483192443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064701, + "balance_loss_mlp": 1.05479479, + "diversity_loss_mlp": 0.0, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.053639196798002685, + "language_loss": 0.79547405, + "learning_rate": 0.0002842618593094802, + "loss": 0.80612105, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3393, + "time_per_iteration": 3.1180903911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066549, + "balance_loss_mlp": 1.05651164, + "diversity_loss_mlp": 0.0, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.09762000223606793, + "language_loss": 0.80486917, + "learning_rate": 0.00028398085132374243, + "loss": 0.81553459, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3394, + "time_per_iteration": 2.805560350418091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_mlp": 1.05185044, + "diversity_loss_mlp": 0.0, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.06212778963151281, + "language_loss": 0.84015262, + "learning_rate": 0.0002836999272040761, + "loss": 0.85077065, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3395, + "time_per_iteration": 3.1151998043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_mlp": 1.05245829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07524661860640132, + "language_loss": 0.83834863, + "learning_rate": 0.00028341908705954575, + "loss": 0.84897625, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3396, + "time_per_iteration": 2.5500996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00599946, + "balance_loss_mlp": 1.02570343, + "diversity_loss_mlp": 0.15256089, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.0014313680900061394, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82361758, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01081435, + "step": 3397, + "time_per_iteration": 4.838392496109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060865, + "balance_loss_mlp": 1.05047619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.08700190278237876, + "language_loss": 0.77911532, + "learning_rate": 0.00028285765913198604, + "loss": 0.78972399, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 3398, + "time_per_iteration": 2.5510177612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_mlp": 1.04590559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.06794032810044964, + "language_loss": 0.82229477, + "learning_rate": 0.0002825770715669227, + "loss": 0.83285522, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3399, + "time_per_iteration": 2.7065982818603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.04248071, + "diversity_loss_mlp": 0.0, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06703848890261048, + "language_loss": 0.81440985, + "learning_rate": 0.00028229656841292634, + "loss": 0.82493651, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3400, + "time_per_iteration": 2.7117483615875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050894, + "balance_loss_mlp": 1.04067171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06998039744710104, + "language_loss": 0.76892245, + "learning_rate": 0.0002820161497788979, + "loss": 0.7794314, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3401, + "time_per_iteration": 2.590047836303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049365, + "balance_loss_mlp": 1.03930926, + "diversity_loss_mlp": 0.0, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.06845614791056948, + "language_loss": 0.86992002, + "learning_rate": 0.00028173581577370545, + "loss": 0.88041365, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.1005249, + "routerloss_mlp": 0.0, + "step": 3402, + "time_per_iteration": 2.7577242851257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047368, + "balance_loss_mlp": 1.03716338, + "diversity_loss_mlp": 0.0, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.059228402052172, + "language_loss": 0.78973734, + "learning_rate": 0.0002814555665061844, + "loss": 0.80021101, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 3403, + "time_per_iteration": 2.731137752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_mlp": 1.0375247, + "diversity_loss_mlp": 0.0, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.07926071177251158, + "language_loss": 0.77611935, + "learning_rate": 0.00028117540208513715, + "loss": 0.78659368, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3404, + "time_per_iteration": 2.689107894897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077145, + "balance_loss_mlp": 1.2970531, + "diversity_loss_mlp": 0.22200939, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.029568297533915613, + "language_loss": 0.85005927, + "learning_rate": 0.00028089532261933313, + "loss": 0.85777372, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01191924, + "step": 3405, + "time_per_iteration": 2.7177927494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105146, + "balance_loss_mlp": 1.04141116, + "diversity_loss_mlp": 0.0, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.08876519929545809, + "language_loss": 0.85989165, + "learning_rate": 0.0002806153282175087, + "loss": 0.87040627, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3406, + "time_per_iteration": 2.5502045154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04348814, + "diversity_loss_mlp": 0.0, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.07350490516448754, + "language_loss": 0.82776654, + "learning_rate": 0.0002803354189883679, + "loss": 0.83829957, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3407, + "time_per_iteration": 2.8476340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_mlp": 1.0448494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.06617021222220203, + "language_loss": 0.85199594, + "learning_rate": 0.00028005559504058053, + "loss": 0.86254454, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3408, + "time_per_iteration": 2.701035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_mlp": 1.04206932, + "diversity_loss_mlp": 0.0, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.08388731304351217, + "language_loss": 0.77208018, + "learning_rate": 0.0002797758564827838, + "loss": 0.78260207, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3409, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058496, + "balance_loss_mlp": 1.04903674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.07006819638769121, + "language_loss": 0.83542061, + "learning_rate": 0.0002794962034235824, + "loss": 0.84600556, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3410, + "time_per_iteration": 2.634612798690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054024, + "balance_loss_mlp": 1.04401076, + "diversity_loss_mlp": 0.0, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.07454971523093613, + "language_loss": 0.74929279, + "learning_rate": 0.00027921663597154695, + "loss": 0.75983304, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3411, + "time_per_iteration": 2.736161708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058952, + "balance_loss_mlp": 1.04926038, + "diversity_loss_mlp": 0.0, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.08159088858174726, + "language_loss": 0.81125355, + "learning_rate": 0.00027893715423521525, + "loss": 0.82184303, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3412, + "time_per_iteration": 2.452563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781164, + "balance_loss_mlp": 1.31892097, + "diversity_loss_mlp": 0.22038518, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.03347946196666781, + "language_loss": 0.8419345, + "learning_rate": 0.00027865775832309163, + "loss": 0.84974611, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151081, + "step": 3413, + "time_per_iteration": 2.6782755851745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068715, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.0675198993979362, + "language_loss": 0.86263126, + "learning_rate": 0.00027837844834364733, + "loss": 0.87331843, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3414, + "time_per_iteration": 2.63967227935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_mlp": 1.04836726, + "diversity_loss_mlp": 0.0, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.06663266607359189, + "language_loss": 0.8659035, + "learning_rate": 0.00027809922440532, + "loss": 0.87648469, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3415, + "time_per_iteration": 2.816204786300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059729, + "balance_loss_mlp": 1.05018628, + "diversity_loss_mlp": 0.0, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06360594790571725, + "language_loss": 0.81154943, + "learning_rate": 0.00027782008661651406, + "loss": 0.82214665, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3416, + "time_per_iteration": 2.80657958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059234, + "balance_loss_mlp": 1.04937577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.062003807204006764, + "language_loss": 0.87255514, + "learning_rate": 0.00027754103508560013, + "loss": 0.88314748, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3417, + "time_per_iteration": 2.648777723312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062014, + "balance_loss_mlp": 1.05205965, + "diversity_loss_mlp": 0.0, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.06781110485333444, + "language_loss": 0.82382166, + "learning_rate": 0.0002772620699209163, + "loss": 0.83444178, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3418, + "time_per_iteration": 2.566547155380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_mlp": 1.0503962, + "diversity_loss_mlp": 0.0, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.0650517875970755, + "language_loss": 0.79616904, + "learning_rate": 0.0002769831912307658, + "loss": 0.80676609, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3419, + "time_per_iteration": 2.606062889099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061383, + "balance_loss_mlp": 1.05156565, + "diversity_loss_mlp": 0.0, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.07306581186555239, + "language_loss": 0.80279779, + "learning_rate": 0.00027670439912341917, + "loss": 0.81341165, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3420, + "time_per_iteration": 2.616004228591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058734, + "balance_loss_mlp": 1.04903078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.07531365664549339, + "language_loss": 0.83319843, + "learning_rate": 0.0002764256937071129, + "loss": 0.84378576, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3421, + "time_per_iteration": 2.7864840030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061647, + "balance_loss_mlp": 1.05205703, + "diversity_loss_mlp": 0.0, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.06844647739450752, + "language_loss": 0.87222612, + "learning_rate": 0.00027614707509005036, + "loss": 0.88284254, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3422, + "time_per_iteration": 2.666473388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058786, + "balance_loss_mlp": 1.04912376, + "diversity_loss_mlp": 0.0, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.0762783210263198, + "language_loss": 0.79373097, + "learning_rate": 0.0002758685433804008, + "loss": 0.8043189, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3423, + "time_per_iteration": 2.4872303009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056028, + "balance_loss_mlp": 1.04637778, + "diversity_loss_mlp": 0.0, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.07259832833327884, + "language_loss": 0.79187661, + "learning_rate": 0.00027559009868630005, + "loss": 0.80243689, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3424, + "time_per_iteration": 3.1284892559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063823, + "balance_loss_mlp": 1.0545187, + "diversity_loss_mlp": 0.0, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.07475259244153008, + "language_loss": 0.80332637, + "learning_rate": 0.0002753117411158491, + "loss": 0.81396455, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3425, + "time_per_iteration": 3.024216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066892, + "balance_loss_mlp": 1.05724216, + "diversity_loss_mlp": 0.0, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.06493586108743211, + "language_loss": 0.89989424, + "learning_rate": 0.0002750334707771168, + "loss": 0.91056317, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3426, + "time_per_iteration": 2.6436870098114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066532, + "balance_loss_mlp": 1.0567987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06891806065084582, + "language_loss": 0.81568319, + "learning_rate": 0.0002747552877781369, + "loss": 0.82634848, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3427, + "time_per_iteration": 2.484457015991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106612, + "balance_loss_mlp": 1.05665517, + "diversity_loss_mlp": 0.0, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.06651025164376474, + "language_loss": 0.81769067, + "learning_rate": 0.0002744771922269097, + "loss": 0.82835186, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3428, + "time_per_iteration": 2.724034547805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06395817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.08249136451092651, + "language_loss": 0.81983304, + "learning_rate": 0.0002741991842314015, + "loss": 0.83056509, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3429, + "time_per_iteration": 3.4791431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106958, + "balance_loss_mlp": 1.06021035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.09631718735244636, + "language_loss": 0.85994452, + "learning_rate": 0.0002739212638995445, + "loss": 0.87064034, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3430, + "time_per_iteration": 2.5809226036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.06089258, + "diversity_loss_mlp": 0.0, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.07152811859744175, + "language_loss": 0.83226836, + "learning_rate": 0.00027364343133923696, + "loss": 0.84297395, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3431, + "time_per_iteration": 2.664724826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072179, + "balance_loss_mlp": 1.06281483, + "diversity_loss_mlp": 0.0, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.07076815482363777, + "language_loss": 0.82710063, + "learning_rate": 0.0002733656866583431, + "loss": 0.83782238, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3432, + "time_per_iteration": 2.6845815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075681, + "balance_loss_mlp": 1.06614459, + "diversity_loss_mlp": 0.0, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07348653509543634, + "language_loss": 0.83014315, + "learning_rate": 0.0002730880299646927, + "loss": 0.84089994, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3433, + "time_per_iteration": 3.09417462348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072804, + "balance_loss_mlp": 1.06348789, + "diversity_loss_mlp": 0.0, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.060523936244010056, + "language_loss": 0.85307741, + "learning_rate": 0.0002728104613660821, + "loss": 0.86380541, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3434, + "time_per_iteration": 2.844012498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.06231332, + "diversity_loss_mlp": 0.0, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06580511923703304, + "language_loss": 0.83062303, + "learning_rate": 0.0002725329809702729, + "loss": 0.84133923, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3435, + "time_per_iteration": 3.203927516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.06119871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.07937285786961487, + "language_loss": 0.76092625, + "learning_rate": 0.0002722555888849921, + "loss": 0.77163053, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3436, + "time_per_iteration": 3.441042423248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.06265306, + "diversity_loss_mlp": 0.0, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06477982340890849, + "language_loss": 0.80420995, + "learning_rate": 0.00027197828521793334, + "loss": 0.81492901, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3437, + "time_per_iteration": 2.508976697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.0631609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.05773126923802199, + "language_loss": 0.85235512, + "learning_rate": 0.0002717010700767552, + "loss": 0.86308175, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3438, + "time_per_iteration": 2.7343809604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788388, + "balance_loss_mlp": 1.33122396, + "diversity_loss_mlp": 0.22170436, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.035967269047030424, + "language_loss": 0.76073134, + "learning_rate": 0.00027142394356908226, + "loss": 0.76861525, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01192367, + "step": 3439, + "time_per_iteration": 2.6098694801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072149, + "balance_loss_mlp": 1.06304741, + "diversity_loss_mlp": 0.0, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.07092995700037574, + "language_loss": 0.84935868, + "learning_rate": 0.00027114690580250456, + "loss": 0.86008012, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3440, + "time_per_iteration": 2.7477781772613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_mlp": 1.05864227, + "diversity_loss_mlp": 0.0, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.07606845250334485, + "language_loss": 0.87084186, + "learning_rate": 0.0002708699568845776, + "loss": 0.88152039, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3441, + "time_per_iteration": 2.6247143745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068327, + "balance_loss_mlp": 1.062343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.03817420207517821, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80356109, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 3442, + "time_per_iteration": 4.9118194580078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070581, + "balance_loss_mlp": 1.06144977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.059711141008881904, + "language_loss": 0.83110899, + "learning_rate": 0.0002703163260247261, + "loss": 0.84181482, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3443, + "time_per_iteration": 2.6146388053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070764, + "balance_loss_mlp": 1.06162047, + "diversity_loss_mlp": 0.0, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.07293118954211444, + "language_loss": 0.81726909, + "learning_rate": 0.0002700396442977399, + "loss": 0.82797676, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3444, + "time_per_iteration": 2.6122488975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072126, + "balance_loss_mlp": 1.06287587, + "diversity_loss_mlp": 0.0, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06235524151571192, + "language_loss": 0.84365332, + "learning_rate": 0.0002697630518492817, + "loss": 0.85437459, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3445, + "time_per_iteration": 2.695577621459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074381, + "balance_loss_mlp": 1.06496358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.09449311389962292, + "language_loss": 0.85555631, + "learning_rate": 0.0002694865487867343, + "loss": 0.86630011, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3446, + "time_per_iteration": 2.643448829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066192, + "balance_loss_mlp": 1.0568881, + "diversity_loss_mlp": 0.0, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.06130478535455018, + "language_loss": 0.84665477, + "learning_rate": 0.0002692101352174453, + "loss": 0.85731673, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3447, + "time_per_iteration": 2.7684693336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071985, + "balance_loss_mlp": 1.06239462, + "diversity_loss_mlp": 0.0, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.0686574359328325, + "language_loss": 0.84783942, + "learning_rate": 0.00026893381124872787, + "loss": 0.85855925, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3448, + "time_per_iteration": 2.6856155395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.06869519, + "diversity_loss_mlp": 0.0, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.07711664740076789, + "language_loss": 0.80761468, + "learning_rate": 0.00026865757698786097, + "loss": 0.8183924, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3449, + "time_per_iteration": 3.0219905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064699, + "balance_loss_mlp": 1.05549026, + "diversity_loss_mlp": 0.0, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.07081100750222453, + "language_loss": 0.81853712, + "learning_rate": 0.000268381432542088, + "loss": 0.82918411, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3450, + "time_per_iteration": 2.7959303855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063443, + "balance_loss_mlp": 1.05394757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.0764006206271421, + "language_loss": 0.80043346, + "learning_rate": 0.00026810537801861807, + "loss": 0.81106788, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3451, + "time_per_iteration": 2.7303504943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058217, + "balance_loss_mlp": 1.04875171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05834244489040309, + "language_loss": 0.81090832, + "learning_rate": 0.0002678294135246243, + "loss": 0.82149041, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3452, + "time_per_iteration": 2.733463764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056165, + "balance_loss_mlp": 1.04691422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07343702884431198, + "language_loss": 0.86356318, + "learning_rate": 0.0002675535391672463, + "loss": 0.87412483, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3453, + "time_per_iteration": 3.115978956222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797636, + "balance_loss_mlp": 1.35083306, + "diversity_loss_mlp": 0.22054271, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.028810841374919304, + "language_loss": 0.86237454, + "learning_rate": 0.0002672777550535877, + "loss": 0.87035096, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01194801, + "step": 3454, + "time_per_iteration": 2.793548822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.05172312, + "diversity_loss_mlp": 0.0, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.0753840272591569, + "language_loss": 0.85331321, + "learning_rate": 0.00026700206129071747, + "loss": 0.8639214, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3455, + "time_per_iteration": 2.5915210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_mlp": 1.05565548, + "diversity_loss_mlp": 0.0, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.07433202645873906, + "language_loss": 0.89061069, + "learning_rate": 0.00026672645798566925, + "loss": 0.90125895, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3456, + "time_per_iteration": 2.5754494667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059485, + "balance_loss_mlp": 1.05019283, + "diversity_loss_mlp": 0.0, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.07294926148794169, + "language_loss": 0.79539233, + "learning_rate": 0.00026645094524544225, + "loss": 0.80598718, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.09289551, + "routerloss_mlp": 0.0, + "step": 3457, + "time_per_iteration": 3.2948148250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056831, + "balance_loss_mlp": 1.04734802, + "diversity_loss_mlp": 0.0, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.08386362480566827, + "language_loss": 0.75221157, + "learning_rate": 0.00026617552317699945, + "loss": 0.76277989, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3458, + "time_per_iteration": 2.789961576461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057714, + "balance_loss_mlp": 1.04836822, + "diversity_loss_mlp": 0.0, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.09354786354914506, + "language_loss": 0.87007248, + "learning_rate": 0.0002659001918872693, + "loss": 0.88064957, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3459, + "time_per_iteration": 2.6320250034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058346, + "balance_loss_mlp": 1.04896998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06598239053228593, + "language_loss": 0.80718446, + "learning_rate": 0.0002656249514831449, + "loss": 0.81776798, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3460, + "time_per_iteration": 2.6485753059387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063647, + "balance_loss_mlp": 1.05442595, + "diversity_loss_mlp": 0.0, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.05863451757746151, + "language_loss": 0.87114978, + "learning_rate": 0.00026534980207148416, + "loss": 0.88178623, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3461, + "time_per_iteration": 3.4618935585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05719471, + "diversity_loss_mlp": 0.0, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.07572861338992695, + "language_loss": 0.73451698, + "learning_rate": 0.0002650747437591097, + "loss": 0.7451815, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3462, + "time_per_iteration": 2.985516309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_mlp": 1.02065372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.017950660829121307, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82906377, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 3463, + "time_per_iteration": 5.041592359542847 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067724, + "balance_loss_mlp": 1.05844963, + "diversity_loss_mlp": 0.0, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.06793562911737132, + "language_loss": 0.86417711, + "learning_rate": 0.00026452490085933155, + "loss": 0.87485433, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3464, + "time_per_iteration": 2.5661425590515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069546, + "balance_loss_mlp": 1.05994368, + "diversity_loss_mlp": 0.0, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.08819800975527838, + "language_loss": 0.89818048, + "learning_rate": 0.00026425011648539614, + "loss": 0.90887594, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3465, + "time_per_iteration": 2.5488314628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065239, + "balance_loss_mlp": 1.05584478, + "diversity_loss_mlp": 0.0, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06406494944770698, + "language_loss": 0.82567346, + "learning_rate": 0.00026397542363768267, + "loss": 0.83632582, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3466, + "time_per_iteration": 2.669250965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781407, + "balance_loss_mlp": 1.32080197, + "diversity_loss_mlp": 0.21862534, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.03313864292511896, + "language_loss": 0.8202821, + "learning_rate": 0.0002637008224228362, + "loss": 0.82809615, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169338, + "step": 3467, + "time_per_iteration": 2.572173833847046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070583, + "balance_loss_mlp": 1.06133246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.05107139851875669, + "language_loss": 0.8441903, + "learning_rate": 0.00026342631294746653, + "loss": 0.85489613, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3468, + "time_per_iteration": 2.698885917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072156, + "balance_loss_mlp": 1.06254137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.05734496396036439, + "language_loss": 0.80842233, + "learning_rate": 0.0002631518953181476, + "loss": 0.81914389, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3469, + "time_per_iteration": 3.4733734130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101407, + "balance_loss_mlp": 1.0077759, + "diversity_loss_mlp": 0.0, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.015747171991140264, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77339357, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 3470, + "time_per_iteration": 4.929265737533569 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074034, + "balance_loss_mlp": 1.06445539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.060826323549746535, + "language_loss": 0.80429429, + "learning_rate": 0.00026260333602377985, + "loss": 0.81503463, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3471, + "time_per_iteration": 2.848822593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076958, + "balance_loss_mlp": 1.06758189, + "diversity_loss_mlp": 0.0, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.07184696149338711, + "language_loss": 0.87395489, + "learning_rate": 0.0002623291945717007, + "loss": 0.88472444, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3472, + "time_per_iteration": 2.500190019607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.06426954, + "diversity_loss_mlp": 0.0, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.06589735356893138, + "language_loss": 0.84111875, + "learning_rate": 0.00026205514539161175, + "loss": 0.85185778, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3473, + "time_per_iteration": 3.534797191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.0632112, + "diversity_loss_mlp": 0.0, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.059882211902428664, + "language_loss": 0.83973366, + "learning_rate": 0.00026178118858990773, + "loss": 0.8504616, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3474, + "time_per_iteration": 2.8565967082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070699, + "balance_loss_mlp": 1.06109083, + "diversity_loss_mlp": 0.0, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.06021787961002869, + "language_loss": 0.84205377, + "learning_rate": 0.0002615073242729483, + "loss": 0.85276067, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3475, + "time_per_iteration": 2.678913116455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_mlp": 1.0605185, + "diversity_loss_mlp": 0.0, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.05349171948445146, + "language_loss": 0.84449661, + "learning_rate": 0.0002612335525470573, + "loss": 0.85519814, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3476, + "time_per_iteration": 2.8754477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05415487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.0743507074362168, + "language_loss": 0.78049976, + "learning_rate": 0.0002609598735185221, + "loss": 0.79113823, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3477, + "time_per_iteration": 2.6721932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_mlp": 1.05687511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.06005632064488323, + "language_loss": 0.83158946, + "learning_rate": 0.00026068628729359445, + "loss": 0.84225374, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3478, + "time_per_iteration": 2.7650654315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068151, + "balance_loss_mlp": 1.05830431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.0704650229723735, + "language_loss": 0.76221395, + "learning_rate": 0.00026041279397848996, + "loss": 0.77289546, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3479, + "time_per_iteration": 2.8531105518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.055673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.06824163679163787, + "language_loss": 0.82570118, + "learning_rate": 0.00026013939367938797, + "loss": 0.8363536, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3480, + "time_per_iteration": 2.8762619495391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798551, + "balance_loss_mlp": 1.35232079, + "diversity_loss_mlp": 0.22152299, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.028482542431452974, + "language_loss": 0.81186199, + "learning_rate": 0.00025986608650243204, + "loss": 0.81984746, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01162949, + "step": 3481, + "time_per_iteration": 2.8153860569000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071437, + "balance_loss_mlp": 1.06166184, + "diversity_loss_mlp": 0.0, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.08903053329626802, + "language_loss": 0.79281807, + "learning_rate": 0.0002595928725537293, + "loss": 0.80353248, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3482, + "time_per_iteration": 2.8563952445983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_mlp": 1.05542827, + "diversity_loss_mlp": 0.0, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.06597366352184171, + "language_loss": 0.8811605, + "learning_rate": 0.0002593197519393509, + "loss": 0.89181018, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3483, + "time_per_iteration": 2.659468650817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_mlp": 1.05117035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06129183928704833, + "language_loss": 0.79517573, + "learning_rate": 0.00025904672476533165, + "loss": 0.80578125, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3484, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_mlp": 1.0531497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.06231151375576235, + "language_loss": 0.82821012, + "learning_rate": 0.0002587737911376704, + "loss": 0.83883744, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3485, + "time_per_iteration": 2.579852819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.0560143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06196157664485949, + "language_loss": 0.84223086, + "learning_rate": 0.00025850095116232885, + "loss": 0.85288531, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3486, + "time_per_iteration": 2.6867549419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059913, + "balance_loss_mlp": 1.05029857, + "diversity_loss_mlp": 0.0, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.07455755751361211, + "language_loss": 0.77796304, + "learning_rate": 0.000258228204945233, + "loss": 0.78856218, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3487, + "time_per_iteration": 2.9104583263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788296, + "balance_loss_mlp": 1.33072948, + "diversity_loss_mlp": 0.22110668, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.03107378418050736, + "language_loss": 0.84813625, + "learning_rate": 0.00025795555259227254, + "loss": 0.8560192, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123779, + "step": 3488, + "time_per_iteration": 2.799049139022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_mlp": 1.05453348, + "diversity_loss_mlp": 0.0, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.05587900492957358, + "language_loss": 0.8365714, + "learning_rate": 0.00025768299420930046, + "loss": 0.84721196, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3489, + "time_per_iteration": 2.7350802421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059988, + "balance_loss_mlp": 1.05058801, + "diversity_loss_mlp": 0.0, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.0636982622522837, + "language_loss": 0.83686626, + "learning_rate": 0.0002574105299021332, + "loss": 0.84746611, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3490, + "time_per_iteration": 2.8952267169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056705, + "balance_loss_mlp": 1.04722226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.059047086854658884, + "language_loss": 0.84235394, + "learning_rate": 0.00025713815977655084, + "loss": 0.85292095, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3491, + "time_per_iteration": 2.8801188468933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059823, + "balance_loss_mlp": 1.05020285, + "diversity_loss_mlp": 0.0, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0713613195550899, + "language_loss": 0.84868813, + "learning_rate": 0.0002568658839382969, + "loss": 0.85928631, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3492, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055936, + "balance_loss_mlp": 1.04666197, + "diversity_loss_mlp": 0.0, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.0809894292628365, + "language_loss": 0.8436929, + "learning_rate": 0.00025659370249307814, + "loss": 0.85425228, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3493, + "time_per_iteration": 2.61505126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_mlp": 1.04709792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.06605957100839344, + "language_loss": 0.85386133, + "learning_rate": 0.00025632161554656473, + "loss": 0.86442864, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3494, + "time_per_iteration": 2.8639488220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_mlp": 1.04485643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.0758709557174038, + "language_loss": 0.8232398, + "learning_rate": 0.00025604962320439017, + "loss": 0.83378488, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3495, + "time_per_iteration": 2.71235728263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_mlp": 1.04692411, + "diversity_loss_mlp": 0.0, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.06832671008161519, + "language_loss": 0.82082075, + "learning_rate": 0.0002557777255721516, + "loss": 0.83138162, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3496, + "time_per_iteration": 2.728652000427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052712, + "balance_loss_mlp": 1.04334807, + "diversity_loss_mlp": 0.0, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.07590882568517338, + "language_loss": 0.80502313, + "learning_rate": 0.0002555059227554087, + "loss": 0.81555027, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3497, + "time_per_iteration": 2.6704843044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.04488301, + "diversity_loss_mlp": 0.0, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.0738650094824256, + "language_loss": 0.77972269, + "learning_rate": 0.00025523421485968453, + "loss": 0.79026586, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3498, + "time_per_iteration": 2.8093771934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057909, + "balance_loss_mlp": 1.04843736, + "diversity_loss_mlp": 0.0, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.07086262263525961, + "language_loss": 0.85447127, + "learning_rate": 0.00025496260199046585, + "loss": 0.86505038, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3499, + "time_per_iteration": 3.0010836124420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.04721487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.056698795982303, + "language_loss": 0.84606051, + "learning_rate": 0.000254691084253202, + "loss": 0.85662723, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3500, + "time_per_iteration": 2.7931160926818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106411, + "balance_loss_mlp": 1.05446577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.075539637024569, + "language_loss": 0.77243733, + "learning_rate": 0.00025441966175330567, + "loss": 0.78307843, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3501, + "time_per_iteration": 2.6508493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067964, + "balance_loss_mlp": 1.05850506, + "diversity_loss_mlp": 0.0, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.07065885937587965, + "language_loss": 0.79737401, + "learning_rate": 0.00025414833459615183, + "loss": 0.80805361, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3502, + "time_per_iteration": 2.784524917602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074197, + "balance_loss_mlp": 1.0648514, + "diversity_loss_mlp": 0.0, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.06652503704287359, + "language_loss": 0.80206275, + "learning_rate": 0.0002538771028870796, + "loss": 0.8128047, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3503, + "time_per_iteration": 2.802136182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075432, + "balance_loss_mlp": 1.06571674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.06376799007020843, + "language_loss": 0.81455564, + "learning_rate": 0.0002536059667313903, + "loss": 0.82530999, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3504, + "time_per_iteration": 2.711933135986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068251, + "balance_loss_mlp": 1.05844057, + "diversity_loss_mlp": 0.0, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.09964706429340704, + "language_loss": 0.89608288, + "learning_rate": 0.0002533349262343483, + "loss": 0.9067654, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3505, + "time_per_iteration": 2.6715004444122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082337, + "balance_loss_mlp": 1.07268143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.06572677444304757, + "language_loss": 0.81604284, + "learning_rate": 0.0002530639815011807, + "loss": 0.82686627, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3506, + "time_per_iteration": 2.4929287433624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078955, + "balance_loss_mlp": 1.33325195, + "diversity_loss_mlp": 0.2229899, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.03439328096706921, + "language_loss": 0.8506915, + "learning_rate": 0.0002527931326370781, + "loss": 0.85858697, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01142928, + "step": 3507, + "time_per_iteration": 2.83644962310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07446539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.08750505461607005, + "language_loss": 0.82915336, + "learning_rate": 0.00025252237974719276, + "loss": 0.83999527, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3508, + "time_per_iteration": 2.871253252029419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081215, + "balance_loss_mlp": 1.07155883, + "diversity_loss_mlp": 0.0, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.08335060522291943, + "language_loss": 0.80458963, + "learning_rate": 0.00025225172293664056, + "loss": 0.81540173, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3509, + "time_per_iteration": 3.033853530883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014527, + "balance_loss_mlp": 1.00832772, + "diversity_loss_mlp": 0.0, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.01800991302482, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77947664, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 3510, + "time_per_iteration": 4.911616325378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.07521439, + "diversity_loss_mlp": 0.0, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.09401749664970258, + "language_loss": 0.84862983, + "learning_rate": 0.00025171069797381106, + "loss": 0.85948253, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3511, + "time_per_iteration": 2.8283350467681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071317, + "balance_loss_mlp": 1.06139874, + "diversity_loss_mlp": 0.0, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.06520954806538445, + "language_loss": 0.82273233, + "learning_rate": 0.00025144033003157864, + "loss": 0.83344549, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3512, + "time_per_iteration": 2.5983166694641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.06117821, + "diversity_loss_mlp": 0.0, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.08310754245868612, + "language_loss": 0.78935671, + "learning_rate": 0.00025117005858876806, + "loss": 0.80006635, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3513, + "time_per_iteration": 2.6797635555267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787238, + "balance_loss_mlp": 1.33182001, + "diversity_loss_mlp": 0.21994653, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.03353723121835004, + "language_loss": 0.85560071, + "learning_rate": 0.000250899883750308, + "loss": 0.86347306, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113544, + "step": 3514, + "time_per_iteration": 2.7176060676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_mlp": 1.04921913, + "diversity_loss_mlp": 0.0, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.07453608092591449, + "language_loss": 0.81898236, + "learning_rate": 0.00025062980562109006, + "loss": 0.82957679, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3515, + "time_per_iteration": 2.7594966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789958, + "balance_loss_mlp": 1.33716106, + "diversity_loss_mlp": 0.21975538, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.033729691487123833, + "language_loss": 0.83036506, + "learning_rate": 0.0002503598243059677, + "loss": 0.83826458, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01150025, + "step": 3516, + "time_per_iteration": 2.891763687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_mlp": 1.04839277, + "diversity_loss_mlp": 0.0, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.07017833187059877, + "language_loss": 0.80408925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81467211, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3517, + "time_per_iteration": 2.672029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786476, + "balance_loss_mlp": 1.32907259, + "diversity_loss_mlp": 0.22110882, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.038425556988831724, + "language_loss": 0.85818875, + "learning_rate": 0.0002498201525372359, + "loss": 0.86605346, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138566, + "step": 3518, + "time_per_iteration": 2.617760419845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054389, + "balance_loss_mlp": 1.04459572, + "diversity_loss_mlp": 0.0, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.06814874892769256, + "language_loss": 0.83201683, + "learning_rate": 0.00024955046229314584, + "loss": 0.84256077, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.09783936, + "routerloss_mlp": 0.0, + "step": 3519, + "time_per_iteration": 2.6269547939300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.04138207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06326657634867637, + "language_loss": 0.87517166, + "learning_rate": 0.00024928086928218947, + "loss": 0.88568723, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 3520, + "time_per_iteration": 2.500542163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057369, + "balance_loss_mlp": 1.04749823, + "diversity_loss_mlp": 0.0, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.0729210521666428, + "language_loss": 0.76251125, + "learning_rate": 0.00024901137360903216, + "loss": 0.77308488, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3521, + "time_per_iteration": 2.921558380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055481, + "balance_loss_mlp": 1.04586673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.08065371435227142, + "language_loss": 0.80853164, + "learning_rate": 0.00024874197537830115, + "loss": 0.81908649, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3522, + "time_per_iteration": 2.5280978679656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793286, + "balance_loss_mlp": 1.3416667, + "diversity_loss_mlp": 0.22178407, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.034341347950706966, + "language_loss": 0.834656, + "learning_rate": 0.00024847267469458684, + "loss": 0.8425889, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0115611, + "step": 3523, + "time_per_iteration": 2.5251760482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058563, + "balance_loss_mlp": 1.04881763, + "diversity_loss_mlp": 0.0, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.0593554156839795, + "language_loss": 0.77790511, + "learning_rate": 0.00024820347166244034, + "loss": 0.78849077, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3524, + "time_per_iteration": 2.9970362186431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061163, + "balance_loss_mlp": 1.051489, + "diversity_loss_mlp": 0.0, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05785383684082485, + "language_loss": 0.8476572, + "learning_rate": 0.0002479343663863755, + "loss": 0.85826874, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3525, + "time_per_iteration": 2.748159885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059476, + "balance_loss_mlp": 1.04968917, + "diversity_loss_mlp": 0.0, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.0719627260838572, + "language_loss": 0.76970756, + "learning_rate": 0.00024766535897086876, + "loss": 0.78030241, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3526, + "time_per_iteration": 2.5848824977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060587, + "balance_loss_mlp": 1.05073428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.06835251841322831, + "language_loss": 0.79290187, + "learning_rate": 0.0002473964495203578, + "loss": 0.80350775, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3527, + "time_per_iteration": 2.6953914165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106164, + "balance_loss_mlp": 1.05191827, + "diversity_loss_mlp": 0.0, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.06684083470405644, + "language_loss": 0.85681713, + "learning_rate": 0.0002471276381392425, + "loss": 0.86743355, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3528, + "time_per_iteration": 2.7917094230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_mlp": 1.02451074, + "diversity_loss_mlp": 0.0, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.029269024795112553, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.7921958, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 3529, + "time_per_iteration": 4.962055921554565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066825, + "balance_loss_mlp": 1.05733609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06831388456608918, + "language_loss": 0.84243917, + "learning_rate": 0.00024659031000260826, + "loss": 0.85310745, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3530, + "time_per_iteration": 2.8746378421783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_mlp": 1.05688381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.07285232550578888, + "language_loss": 0.80730051, + "learning_rate": 0.0002463217934556985, + "loss": 0.81796598, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3531, + "time_per_iteration": 2.7028424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014286, + "balance_loss_mlp": 1.00808728, + "diversity_loss_mlp": 0.0, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.01858574921496822, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77546376, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.06201172, + "routerloss_mlp": 0.0, + "step": 3532, + "time_per_iteration": 4.780252933502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071665, + "balance_loss_mlp": 1.06221724, + "diversity_loss_mlp": 0.0, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.08979673870599186, + "language_loss": 0.83808529, + "learning_rate": 0.0002457850559259306, + "loss": 0.84880191, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3533, + "time_per_iteration": 2.9009928703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107616, + "balance_loss_mlp": 1.06684947, + "diversity_loss_mlp": 0.0, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.06667977411786664, + "language_loss": 0.81866515, + "learning_rate": 0.00024551683515145275, + "loss": 0.82942677, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3534, + "time_per_iteration": 2.67411208152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.0675205, + "diversity_loss_mlp": 0.0, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.06662082176408471, + "language_loss": 0.86499625, + "learning_rate": 0.0002452487131761014, + "loss": 0.87576586, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3535, + "time_per_iteration": 2.723414421081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071026, + "balance_loss_mlp": 1.06126261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.07513209939898634, + "language_loss": 0.79904449, + "learning_rate": 0.00024498069010397093, + "loss": 0.80975473, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3536, + "time_per_iteration": 2.729044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.06177378, + "diversity_loss_mlp": 0.0, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.062001089349607685, + "language_loss": 0.85142958, + "learning_rate": 0.00024471276603911697, + "loss": 0.86214507, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3537, + "time_per_iteration": 4.243680953979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.06360102, + "diversity_loss_mlp": 0.0, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.06230124795461592, + "language_loss": 0.79373354, + "learning_rate": 0.0002444449410855572, + "loss": 0.80446529, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3538, + "time_per_iteration": 2.744311571121216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071763, + "balance_loss_mlp": 1.06218505, + "diversity_loss_mlp": 0.0, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.057428584707934646, + "language_loss": 0.84307408, + "learning_rate": 0.00024417721534727033, + "loss": 0.85379171, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3539, + "time_per_iteration": 2.643796920776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073723, + "balance_loss_mlp": 1.06420994, + "diversity_loss_mlp": 0.0, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.09448746877359589, + "language_loss": 0.82968056, + "learning_rate": 0.00024390958892819687, + "loss": 0.8404178, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3540, + "time_per_iteration": 2.500807285308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010722, + "balance_loss_mlp": 1.0624193, + "diversity_loss_mlp": 0.0, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.06494427347835982, + "language_loss": 0.80941665, + "learning_rate": 0.0002436420619322381, + "loss": 0.82013869, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3541, + "time_per_iteration": 2.8345742225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.0675267, + "diversity_loss_mlp": 0.0, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.07816741001086884, + "language_loss": 0.82754946, + "learning_rate": 0.0002433746344632577, + "loss": 0.83832312, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3542, + "time_per_iteration": 2.6863982677459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.05741465, + "diversity_loss_mlp": 0.0, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.06517118266272649, + "language_loss": 0.80166835, + "learning_rate": 0.00024310730662508006, + "loss": 0.81234175, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3543, + "time_per_iteration": 3.0644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070309, + "balance_loss_mlp": 1.06105816, + "diversity_loss_mlp": 0.0, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.06994305910782121, + "language_loss": 0.87753445, + "learning_rate": 0.0002428400785214911, + "loss": 0.88823748, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3544, + "time_per_iteration": 2.5769219398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.06136894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.07082765333867001, + "language_loss": 0.82354796, + "learning_rate": 0.00024257295025623794, + "loss": 0.83425504, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3545, + "time_per_iteration": 2.799276828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066867, + "balance_loss_mlp": 1.05750871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.06649234916050309, + "language_loss": 0.8049404, + "learning_rate": 0.00024230592193302892, + "loss": 0.8156091, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3546, + "time_per_iteration": 2.9205825328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064713, + "balance_loss_mlp": 1.05521762, + "diversity_loss_mlp": 0.0, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.07288649013986744, + "language_loss": 0.84268177, + "learning_rate": 0.00024203899365553372, + "loss": 0.85332888, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3547, + "time_per_iteration": 2.5345499515533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_mlp": 1.02241051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024887330229706912, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77762419, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3548, + "time_per_iteration": 4.575555801391602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066126, + "balance_loss_mlp": 1.05700111, + "diversity_loss_mlp": 0.0, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06418703018565212, + "language_loss": 0.83182037, + "learning_rate": 0.00024150543765216848, + "loss": 0.84248167, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3549, + "time_per_iteration": 2.9021003246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060662, + "balance_loss_mlp": 1.05113733, + "diversity_loss_mlp": 0.0, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.07049185581954354, + "language_loss": 0.83715057, + "learning_rate": 0.00024123881013344352, + "loss": 0.8477571, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3550, + "time_per_iteration": 2.671104669570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.05335271, + "diversity_loss_mlp": 0.0, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.06503037380674516, + "language_loss": 0.7999897, + "learning_rate": 0.00024097228307472202, + "loss": 0.81061488, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3551, + "time_per_iteration": 2.826650619506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05474889, + "diversity_loss_mlp": 0.0, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06680109192015529, + "language_loss": 0.82289582, + "learning_rate": 0.00024070585657947846, + "loss": 0.83353829, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3552, + "time_per_iteration": 2.831995725631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.05527949, + "diversity_loss_mlp": 0.0, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.065434895685697, + "language_loss": 0.85023475, + "learning_rate": 0.00024043953075114934, + "loss": 0.86087978, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3553, + "time_per_iteration": 2.622846841812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055947, + "balance_loss_mlp": 1.0463928, + "diversity_loss_mlp": 0.0, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.07243414619593286, + "language_loss": 0.89257199, + "learning_rate": 0.00024017330569313128, + "loss": 0.90313148, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3554, + "time_per_iteration": 2.705098867416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065851, + "balance_loss_mlp": 1.05631375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.06810293796091849, + "language_loss": 0.7482394, + "learning_rate": 0.0002399071815087821, + "loss": 0.7588979, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3555, + "time_per_iteration": 3.053788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.05496788, + "diversity_loss_mlp": 0.0, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0721005752972134, + "language_loss": 0.83788198, + "learning_rate": 0.00023964115830142025, + "loss": 0.84852719, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3556, + "time_per_iteration": 2.7068707942962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062319, + "balance_loss_mlp": 1.05320573, + "diversity_loss_mlp": 0.0, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.07897700130685587, + "language_loss": 0.87426114, + "learning_rate": 0.00023937523617432522, + "loss": 0.88488424, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3557, + "time_per_iteration": 2.526129722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.05461264, + "diversity_loss_mlp": 0.0, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08002974259616906, + "language_loss": 0.8704505, + "learning_rate": 0.00023910941523073705, + "loss": 0.88108861, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3558, + "time_per_iteration": 3.884982109069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067156, + "balance_loss_mlp": 1.05752969, + "diversity_loss_mlp": 0.0, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.0697798269972245, + "language_loss": 0.86687434, + "learning_rate": 0.0002388436955738566, + "loss": 0.87754589, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3559, + "time_per_iteration": 2.6896438598632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.05763495, + "diversity_loss_mlp": 0.0, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.07371598831130721, + "language_loss": 0.81583881, + "learning_rate": 0.00023857807730684523, + "loss": 0.82651019, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3560, + "time_per_iteration": 2.906409740447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.06119633, + "diversity_loss_mlp": 0.0, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.09020757950976771, + "language_loss": 0.82591355, + "learning_rate": 0.00023831256053282547, + "loss": 0.83662075, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3561, + "time_per_iteration": 2.741647481918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076955, + "balance_loss_mlp": 1.06726301, + "diversity_loss_mlp": 0.0, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06598100836979733, + "language_loss": 0.7798056, + "learning_rate": 0.00023804714535488003, + "loss": 0.79057515, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3562, + "time_per_iteration": 2.8663859367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022665, + "balance_loss_mlp": 1.01694274, + "diversity_loss_mlp": 0.0, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.018293527884891043, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80832297, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.05712891, + "routerloss_mlp": 0.0, + "step": 3563, + "time_per_iteration": 4.938952684402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.06765318, + "diversity_loss_mlp": 0.0, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.06579070354920068, + "language_loss": 0.8089236, + "learning_rate": 0.00023751662019934488, + "loss": 0.81969196, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3564, + "time_per_iteration": 2.4886345863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085968, + "balance_loss_mlp": 1.07677126, + "diversity_loss_mlp": 0.0, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.06770513871895241, + "language_loss": 0.79428673, + "learning_rate": 0.00023725151042772364, + "loss": 0.80514634, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3565, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091397, + "balance_loss_mlp": 1.08220637, + "diversity_loss_mlp": 0.0, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.0657025292696896, + "language_loss": 0.83245081, + "learning_rate": 0.00023698650266411276, + "loss": 0.84336478, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3566, + "time_per_iteration": 2.619652032852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087671, + "balance_loss_mlp": 1.07844996, + "diversity_loss_mlp": 0.0, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.07570090303701395, + "language_loss": 0.82732457, + "learning_rate": 0.00023672159701139755, + "loss": 0.83820128, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3567, + "time_per_iteration": 3.2096190452575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092795, + "balance_loss_mlp": 1.08350825, + "diversity_loss_mlp": 0.0, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.07219945861824417, + "language_loss": 0.86111134, + "learning_rate": 0.00023645679357242296, + "loss": 0.87203926, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3568, + "time_per_iteration": 2.598115921020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792363, + "balance_loss_mlp": 1.34135008, + "diversity_loss_mlp": 0.22022857, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.03374979092207147, + "language_loss": 0.84308195, + "learning_rate": 0.00023619209244999534, + "loss": 0.85100567, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01157361, + "step": 3569, + "time_per_iteration": 2.647141695022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.07559109, + "diversity_loss_mlp": 0.0, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.09720254317506574, + "language_loss": 0.85017771, + "learning_rate": 0.0002359274937468806, + "loss": 0.86102515, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3570, + "time_per_iteration": 2.5088424682617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080851, + "balance_loss_mlp": 1.07149255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.06491952507138833, + "language_loss": 0.77798098, + "learning_rate": 0.00023566299756580512, + "loss": 0.78878951, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3571, + "time_per_iteration": 2.6349782943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.07132113, + "diversity_loss_mlp": 0.0, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.07205344290521438, + "language_loss": 0.78495932, + "learning_rate": 0.0002353986040094551, + "loss": 0.79576588, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3572, + "time_per_iteration": 2.4710493087768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079091, + "balance_loss_mlp": 1.06974494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.07195013135933294, + "language_loss": 0.7977035, + "learning_rate": 0.00023513431318047796, + "loss": 0.80849445, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3573, + "time_per_iteration": 2.5213143825531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081479, + "balance_loss_mlp": 1.07233512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.0671999790126143, + "language_loss": 0.77178657, + "learning_rate": 0.00023487012518147977, + "loss": 0.78260136, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3574, + "time_per_iteration": 3.2319135665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073879, + "balance_loss_mlp": 1.06456256, + "diversity_loss_mlp": 0.0, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06898424741609648, + "language_loss": 0.84452772, + "learning_rate": 0.00023460604011502772, + "loss": 0.85526657, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3575, + "time_per_iteration": 3.8878557682037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075527, + "balance_loss_mlp": 1.0666877, + "diversity_loss_mlp": 0.0, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.0699577179930161, + "language_loss": 0.85862118, + "learning_rate": 0.00023434205808364845, + "loss": 0.86937642, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3576, + "time_per_iteration": 3.1633143424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072972, + "balance_loss_mlp": 1.06390619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.07476899851847786, + "language_loss": 0.85238355, + "learning_rate": 0.00023407817918982932, + "loss": 0.86311328, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3577, + "time_per_iteration": 2.7126357555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075894, + "balance_loss_mlp": 1.06677413, + "diversity_loss_mlp": 0.0, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.07427735671199864, + "language_loss": 0.78816962, + "learning_rate": 0.00023381440353601718, + "loss": 0.79892862, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3578, + "time_per_iteration": 2.9925150871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069003, + "balance_loss_mlp": 1.05976987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.07604251893794473, + "language_loss": 0.86125422, + "learning_rate": 0.00023355073122461822, + "loss": 0.87194419, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3579, + "time_per_iteration": 2.938112258911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.05620754, + "diversity_loss_mlp": 0.0, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06357801718819331, + "language_loss": 0.82597542, + "learning_rate": 0.00023328716235799973, + "loss": 0.83662832, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3580, + "time_per_iteration": 3.2711336612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066362, + "balance_loss_mlp": 1.05755877, + "diversity_loss_mlp": 0.0, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.07922172227575792, + "language_loss": 0.84162283, + "learning_rate": 0.00023302369703848803, + "loss": 0.85228646, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3581, + "time_per_iteration": 2.8185226917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069197, + "balance_loss_mlp": 1.06004775, + "diversity_loss_mlp": 0.0, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.07416922878209098, + "language_loss": 0.79931486, + "learning_rate": 0.00023276033536836937, + "loss": 0.81000686, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3582, + "time_per_iteration": 2.844299554824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061227, + "balance_loss_mlp": 1.05179787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06489183727188522, + "language_loss": 0.85119617, + "learning_rate": 0.00023249707744988984, + "loss": 0.86180842, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3583, + "time_per_iteration": 2.701711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_mlp": 1.05140829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07019303893436639, + "language_loss": 0.82148254, + "learning_rate": 0.00023223392338525529, + "loss": 0.83209163, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3584, + "time_per_iteration": 2.5167200565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053502, + "balance_loss_mlp": 1.04406083, + "diversity_loss_mlp": 0.0, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.06639305906088179, + "language_loss": 0.78639823, + "learning_rate": 0.00023197087327663107, + "loss": 0.79693329, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3585, + "time_per_iteration": 2.6349897384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057819, + "balance_loss_mlp": 1.04834747, + "diversity_loss_mlp": 0.0, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.0732534701091779, + "language_loss": 0.81201088, + "learning_rate": 0.00023170792722614243, + "loss": 0.82258916, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3586, + "time_per_iteration": 2.9198050498962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056115, + "balance_loss_mlp": 1.04651892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.06720533838288198, + "language_loss": 0.83776879, + "learning_rate": 0.00023144508533587377, + "loss": 0.84832996, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3587, + "time_per_iteration": 2.8723502159118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054327, + "balance_loss_mlp": 1.04436147, + "diversity_loss_mlp": 0.0, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.07065225941485688, + "language_loss": 0.78699905, + "learning_rate": 0.0002311823477078698, + "loss": 0.79754233, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3588, + "time_per_iteration": 2.9407894611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054694, + "balance_loss_mlp": 1.04507959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.0778571388662146, + "language_loss": 0.85240763, + "learning_rate": 0.00023091971444413428, + "loss": 0.8629545, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3589, + "time_per_iteration": 2.796943187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054005, + "balance_loss_mlp": 1.04385448, + "diversity_loss_mlp": 0.0, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.0732795678952718, + "language_loss": 0.82600373, + "learning_rate": 0.00023065718564663012, + "loss": 0.8365438, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3590, + "time_per_iteration": 2.742586135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010537, + "balance_loss_mlp": 1.00519681, + "diversity_loss_mlp": 0.0, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.012465594930310886, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74922127, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.0534668, + "routerloss_mlp": 0.0, + "step": 3591, + "time_per_iteration": 4.981812477111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079259, + "balance_loss_mlp": 1.34177041, + "diversity_loss_mlp": 0.2198928, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.028847197535296083, + "language_loss": 0.80689478, + "learning_rate": 0.0002301324418579666, + "loss": 0.81482071, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0117582, + "step": 3592, + "time_per_iteration": 2.71809983253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0058906, + "balance_loss_mlp": 1.01557088, + "diversity_loss_mlp": 0.14263315, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.0010924650790030575, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79277533, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00995804, + "step": 3593, + "time_per_iteration": 4.800194263458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064196, + "balance_loss_mlp": 1.05474234, + "diversity_loss_mlp": 0.0, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.08227146788009188, + "language_loss": 0.80700612, + "learning_rate": 0.00022960811715677415, + "loss": 0.81764805, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3594, + "time_per_iteration": 2.8780887126922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065404, + "balance_loss_mlp": 1.05574787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.06283622806249096, + "language_loss": 0.82029772, + "learning_rate": 0.00022934611221845608, + "loss": 0.83095175, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3595, + "time_per_iteration": 2.80785870552063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062245, + "balance_loss_mlp": 1.05264866, + "diversity_loss_mlp": 0.0, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.07415067488634865, + "language_loss": 0.77666163, + "learning_rate": 0.00022908421235729609, + "loss": 0.78728402, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3596, + "time_per_iteration": 2.75410795211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065698, + "balance_loss_mlp": 1.05607235, + "diversity_loss_mlp": 0.0, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.06984612144500793, + "language_loss": 0.8509379, + "learning_rate": 0.0002288224176749728, + "loss": 0.86159492, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3597, + "time_per_iteration": 2.670696258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070664, + "balance_loss_mlp": 1.06105542, + "diversity_loss_mlp": 0.0, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.1037313094960325, + "language_loss": 0.78704476, + "learning_rate": 0.00022856072827312385, + "loss": 0.79775131, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3598, + "time_per_iteration": 2.795475959777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106581, + "balance_loss_mlp": 1.05624998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06439958207329444, + "language_loss": 0.77316082, + "learning_rate": 0.00022829914425334598, + "loss": 0.78381896, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3599, + "time_per_iteration": 2.6179866790771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064235, + "balance_loss_mlp": 1.05483484, + "diversity_loss_mlp": 0.0, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.06408780313496462, + "language_loss": 0.80725557, + "learning_rate": 0.0002280376657171956, + "loss": 0.81789792, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3600, + "time_per_iteration": 2.633162021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_mlp": 1.05445051, + "diversity_loss_mlp": 0.0, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.07377083778937557, + "language_loss": 0.76414573, + "learning_rate": 0.00022777629276618706, + "loss": 0.77478784, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3601, + "time_per_iteration": 3.0916104316711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065762, + "balance_loss_mlp": 1.05597496, + "diversity_loss_mlp": 0.0, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.06702562864271609, + "language_loss": 0.77948666, + "learning_rate": 0.0002275150255017947, + "loss": 0.79014426, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3602, + "time_per_iteration": 2.7668936252593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012943, + "balance_loss_mlp": 1.00765014, + "diversity_loss_mlp": 0.0, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.010670435186768691, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76745617, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 3603, + "time_per_iteration": 5.010159492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011501, + "balance_loss_mlp": 1.00618434, + "diversity_loss_mlp": 0.0, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.00963913060826947, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76138604, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3604, + "time_per_iteration": 4.7926812171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061095, + "balance_loss_mlp": 1.05157018, + "diversity_loss_mlp": 0.0, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06111799581134822, + "language_loss": 0.84283471, + "learning_rate": 0.0002267318588424379, + "loss": 0.85344565, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3605, + "time_per_iteration": 2.732388496398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_mlp": 1.04717207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.07244313312376265, + "language_loss": 0.87551069, + "learning_rate": 0.00022647101533842845, + "loss": 0.88607633, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3606, + "time_per_iteration": 3.001912832260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_mlp": 1.04882836, + "diversity_loss_mlp": 0.0, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.07498146805012186, + "language_loss": 0.76334918, + "learning_rate": 0.00022621027802778872, + "loss": 0.77393162, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3607, + "time_per_iteration": 2.6257400512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052774, + "balance_loss_mlp": 1.04345798, + "diversity_loss_mlp": 0.0, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.07029819881410336, + "language_loss": 0.78756207, + "learning_rate": 0.00022594964701174586, + "loss": 0.79808986, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3608, + "time_per_iteration": 2.6099236011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065561, + "balance_loss_mlp": 1.05642402, + "diversity_loss_mlp": 0.0, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.10152593614861574, + "language_loss": 0.84643018, + "learning_rate": 0.00022568912239148586, + "loss": 0.85708582, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3609, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059207, + "balance_loss_mlp": 1.04986095, + "diversity_loss_mlp": 0.0, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.06906376751770449, + "language_loss": 0.81638551, + "learning_rate": 0.00022542870426815344, + "loss": 0.82697761, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3610, + "time_per_iteration": 2.69460129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_mlp": 1.04869449, + "diversity_loss_mlp": 0.0, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.07528135941421366, + "language_loss": 0.86051476, + "learning_rate": 0.00022516839274285173, + "loss": 0.87109709, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3611, + "time_per_iteration": 2.5634658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063678, + "balance_loss_mlp": 1.05389714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.06331906344074151, + "language_loss": 0.7521888, + "learning_rate": 0.00022490818791664265, + "loss": 0.76282561, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3612, + "time_per_iteration": 2.617492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.05837226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.05946591075452152, + "language_loss": 0.85666263, + "learning_rate": 0.00022464808989054676, + "loss": 0.86734116, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3613, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789837, + "balance_loss_mlp": 1.33770788, + "diversity_loss_mlp": 0.21965824, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.03604068217542595, + "language_loss": 0.76138353, + "learning_rate": 0.00022438809876554284, + "loss": 0.76928186, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115366, + "step": 3614, + "time_per_iteration": 2.6613171100616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070097, + "balance_loss_mlp": 1.0602442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.08971125257054285, + "language_loss": 0.80425173, + "learning_rate": 0.00022412821464256873, + "loss": 0.81495273, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3615, + "time_per_iteration": 2.7288718223571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_mlp": 1.06157804, + "diversity_loss_mlp": 0.0, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.07384702921709109, + "language_loss": 0.82342923, + "learning_rate": 0.00022386843762252023, + "loss": 0.83414114, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3616, + "time_per_iteration": 2.5761711597442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106737, + "balance_loss_mlp": 1.0575707, + "diversity_loss_mlp": 0.0, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.07908443617567998, + "language_loss": 0.79798818, + "learning_rate": 0.00022360876780625193, + "loss": 0.80866194, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3617, + "time_per_iteration": 2.6008386611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059868, + "balance_loss_mlp": 1.05015886, + "diversity_loss_mlp": 0.0, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.07021226627677062, + "language_loss": 0.80116498, + "learning_rate": 0.00022334920529457604, + "loss": 0.81176364, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3618, + "time_per_iteration": 2.9185733795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105864, + "balance_loss_mlp": 1.04876924, + "diversity_loss_mlp": 0.0, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05697997760775425, + "language_loss": 0.87189567, + "learning_rate": 0.00022308975018826423, + "loss": 0.88248205, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3619, + "time_per_iteration": 2.927544355392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_mlp": 1.04414856, + "diversity_loss_mlp": 0.0, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.0740354998090604, + "language_loss": 0.84932256, + "learning_rate": 0.00022283040258804564, + "loss": 0.85986531, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3620, + "time_per_iteration": 2.755613327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787662, + "balance_loss_mlp": 1.33203387, + "diversity_loss_mlp": 0.22018704, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.033538632644234186, + "language_loss": 0.83875167, + "learning_rate": 0.00022257116259460802, + "loss": 0.84662825, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155162, + "step": 3621, + "time_per_iteration": 2.844062089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.03843641, + "diversity_loss_mlp": 0.0, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.06349986715080715, + "language_loss": 0.81602001, + "learning_rate": 0.00022231203030859725, + "loss": 0.82649869, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3622, + "time_per_iteration": 2.9582505226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04382682, + "diversity_loss_mlp": 0.0, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.09473470519326596, + "language_loss": 0.83760095, + "learning_rate": 0.00022205300583061737, + "loss": 0.84813607, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3623, + "time_per_iteration": 2.5727412700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016252, + "balance_loss_mlp": 1.01057744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.01746847385777515, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83854461, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.05664062, + "routerloss_mlp": 0.0, + "step": 3624, + "time_per_iteration": 4.8940582275390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051503, + "balance_loss_mlp": 1.04190028, + "diversity_loss_mlp": 0.0, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.07214179790538137, + "language_loss": 0.77598304, + "learning_rate": 0.00022153528070095735, + "loss": 0.78649807, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3625, + "time_per_iteration": 2.694251298904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_mlp": 1.03960037, + "diversity_loss_mlp": 0.0, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07542787145084529, + "language_loss": 0.88381326, + "learning_rate": 0.00022127658025027568, + "loss": 0.89430594, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3626, + "time_per_iteration": 2.6595661640167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053933, + "balance_loss_mlp": 1.04412818, + "diversity_loss_mlp": 0.0, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.08038583191357998, + "language_loss": 0.85689813, + "learning_rate": 0.00022101798800962258, + "loss": 0.86743748, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3627, + "time_per_iteration": 2.6137661933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057745, + "balance_loss_mlp": 1.04847646, + "diversity_loss_mlp": 0.0, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.08075391789271535, + "language_loss": 0.78634858, + "learning_rate": 0.00022075950407939227, + "loss": 0.79692602, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3628, + "time_per_iteration": 2.6296188831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059141, + "balance_loss_mlp": 1.04959214, + "diversity_loss_mlp": 0.0, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.0897351301563825, + "language_loss": 0.8281461, + "learning_rate": 0.0002205011285599367, + "loss": 0.83873749, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3629, + "time_per_iteration": 2.6147000789642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079513, + "balance_loss_mlp": 1.34714937, + "diversity_loss_mlp": 0.21970588, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.029792453728032804, + "language_loss": 0.80962801, + "learning_rate": 0.00022024286155156658, + "loss": 0.81757927, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01170244, + "step": 3630, + "time_per_iteration": 2.8613815307617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058875, + "balance_loss_mlp": 1.04967785, + "diversity_loss_mlp": 0.0, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.10033041150535157, + "language_loss": 0.86079919, + "learning_rate": 0.00021998470315454994, + "loss": 0.87138796, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3631, + "time_per_iteration": 2.647185802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_mlp": 1.05195761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.06594571513985185, + "language_loss": 0.86829215, + "learning_rate": 0.00021972665346911275, + "loss": 0.87890601, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3632, + "time_per_iteration": 2.757704257965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065424, + "balance_loss_mlp": 1.05622673, + "diversity_loss_mlp": 0.0, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.06824207534465764, + "language_loss": 0.79957312, + "learning_rate": 0.00021946871259543877, + "loss": 0.81022739, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3633, + "time_per_iteration": 2.577909231185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063518, + "balance_loss_mlp": 1.05467892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.08329780404335202, + "language_loss": 0.83364546, + "learning_rate": 0.00021921088063366957, + "loss": 0.84428072, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3634, + "time_per_iteration": 2.933506965637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106547, + "balance_loss_mlp": 1.05625534, + "diversity_loss_mlp": 0.0, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.06097911291290099, + "language_loss": 0.81932688, + "learning_rate": 0.00021895315768390435, + "loss": 0.82998157, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3635, + "time_per_iteration": 2.6155378818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.06179357, + "diversity_loss_mlp": 0.0, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.05851098027896569, + "language_loss": 0.87547219, + "learning_rate": 0.00021869554384619999, + "loss": 0.88618374, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3636, + "time_per_iteration": 2.9845876693725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.05937409, + "diversity_loss_mlp": 0.0, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.066101183722826, + "language_loss": 0.80819213, + "learning_rate": 0.00021843803922057115, + "loss": 0.81887871, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3637, + "time_per_iteration": 2.736743688583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069376, + "balance_loss_mlp": 1.060215, + "diversity_loss_mlp": 0.0, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.07934438223674636, + "language_loss": 0.8197611, + "learning_rate": 0.00021818064390698977, + "loss": 0.83045483, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3638, + "time_per_iteration": 2.6075611114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_mlp": 1.06178594, + "diversity_loss_mlp": 0.0, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.0705113992952529, + "language_loss": 0.87237096, + "learning_rate": 0.0002179233580053861, + "loss": 0.88307768, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3639, + "time_per_iteration": 2.7142910957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107015, + "balance_loss_mlp": 1.06120896, + "diversity_loss_mlp": 0.0, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.07560028355572443, + "language_loss": 0.85636085, + "learning_rate": 0.00021766618161564688, + "loss": 0.86706233, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3640, + "time_per_iteration": 2.7285115718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065402, + "balance_loss_mlp": 1.0562886, + "diversity_loss_mlp": 0.0, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.06395770762467583, + "language_loss": 0.87343419, + "learning_rate": 0.00021740911483761677, + "loss": 0.88408822, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3641, + "time_per_iteration": 2.584667205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068314, + "balance_loss_mlp": 1.05936706, + "diversity_loss_mlp": 0.0, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.05940351360925286, + "language_loss": 0.91777283, + "learning_rate": 0.00021715215777109837, + "loss": 0.92845595, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 3642, + "time_per_iteration": 2.9933156967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.06025815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.07347565488383569, + "language_loss": 0.84518594, + "learning_rate": 0.00021689531051585103, + "loss": 0.85587853, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3643, + "time_per_iteration": 2.6531710624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067704, + "balance_loss_mlp": 1.05844164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.08696231717445767, + "language_loss": 0.80713868, + "learning_rate": 0.00021663857317159196, + "loss": 0.81781578, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3644, + "time_per_iteration": 2.604703426361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.06396961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.057193672258815845, + "language_loss": 0.81973934, + "learning_rate": 0.00021638194583799487, + "loss": 0.83046699, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3645, + "time_per_iteration": 2.6747145652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.05851054, + "diversity_loss_mlp": 0.0, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.08498226844175927, + "language_loss": 0.82551372, + "learning_rate": 0.00021612542861469176, + "loss": 0.83618826, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3646, + "time_per_iteration": 3.2375802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067002, + "balance_loss_mlp": 1.05810285, + "diversity_loss_mlp": 0.0, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.07003978186883456, + "language_loss": 0.8260622, + "learning_rate": 0.00021586902160127135, + "loss": 0.83673215, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3647, + "time_per_iteration": 2.6448206901550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076859, + "balance_loss_mlp": 1.06791854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.11788208419913924, + "language_loss": 0.74163634, + "learning_rate": 0.00021561272489727974, + "loss": 0.75240493, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3648, + "time_per_iteration": 2.5040485858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.07128358, + "diversity_loss_mlp": 0.0, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.06337788759133205, + "language_loss": 0.8008945, + "learning_rate": 0.0002153565386022199, + "loss": 0.81169432, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 3649, + "time_per_iteration": 2.7248024940490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076924, + "balance_loss_mlp": 1.06812, + "diversity_loss_mlp": 0.0, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0801860998557123, + "language_loss": 0.82855487, + "learning_rate": 0.00021510046281555262, + "loss": 0.83932412, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 3650, + "time_per_iteration": 2.809051036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077447, + "balance_loss_mlp": 1.06870925, + "diversity_loss_mlp": 0.0, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.08542793543919469, + "language_loss": 0.81736684, + "learning_rate": 0.0002148444976366949, + "loss": 0.82814133, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 3651, + "time_per_iteration": 2.7492573261260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_mlp": 1.07583714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.0799718694707253, + "language_loss": 0.82820916, + "learning_rate": 0.00021458864316502136, + "loss": 0.83905321, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 3652, + "time_per_iteration": 2.7140626907348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.07368028, + "diversity_loss_mlp": 0.0, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.0716785593922181, + "language_loss": 0.87417138, + "learning_rate": 0.0002143328994998634, + "loss": 0.88499534, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 3653, + "time_per_iteration": 2.5076870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074305, + "balance_loss_mlp": 1.06541252, + "diversity_loss_mlp": 0.0, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.078552736129926, + "language_loss": 0.78368807, + "learning_rate": 0.00021407726674050982, + "loss": 0.79443109, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3654, + "time_per_iteration": 2.8595826625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077013, + "balance_loss_mlp": 1.06806064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.06456326920806615, + "language_loss": 0.8704083, + "learning_rate": 0.0002138217449862061, + "loss": 0.88117838, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3655, + "time_per_iteration": 2.727473258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074047, + "balance_loss_mlp": 1.06530333, + "diversity_loss_mlp": 0.0, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.06685907167482581, + "language_loss": 0.78296137, + "learning_rate": 0.00021356633433615403, + "loss": 0.79370177, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3656, + "time_per_iteration": 2.5853357315063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072471, + "balance_loss_mlp": 1.06341755, + "diversity_loss_mlp": 0.0, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.05195711031116695, + "language_loss": 0.83568424, + "learning_rate": 0.0002133110348895133, + "loss": 0.84640896, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3657, + "time_per_iteration": 2.966989517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.06044364, + "diversity_loss_mlp": 0.0, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.05842315057280589, + "language_loss": 0.85166538, + "learning_rate": 0.0002130558467453999, + "loss": 0.86236197, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3658, + "time_per_iteration": 3.3303468227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080025, + "balance_loss_mlp": 1.07069683, + "diversity_loss_mlp": 0.0, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06729984707772495, + "language_loss": 0.8469972, + "learning_rate": 0.0002128007700028865, + "loss": 0.85779744, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3659, + "time_per_iteration": 2.7004916667938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_mlp": 1.06041121, + "diversity_loss_mlp": 0.0, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.08608403684795747, + "language_loss": 0.84587854, + "learning_rate": 0.00021254580476100276, + "loss": 0.85657346, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3660, + "time_per_iteration": 2.5480196475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072222, + "balance_loss_mlp": 1.06278646, + "diversity_loss_mlp": 0.0, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.07339918095130941, + "language_loss": 0.79315257, + "learning_rate": 0.00021229095111873497, + "loss": 0.80387473, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3661, + "time_per_iteration": 2.7757935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791151, + "balance_loss_mlp": 1.34026599, + "diversity_loss_mlp": 0.21938899, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.027590424390171175, + "language_loss": 0.85883224, + "learning_rate": 0.0002120362091750261, + "loss": 0.8667438, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01132388, + "step": 3662, + "time_per_iteration": 2.896202802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798199, + "balance_loss_mlp": 1.35343075, + "diversity_loss_mlp": 0.22044487, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.03684811642709949, + "language_loss": 0.87121612, + "learning_rate": 0.00021178157902877566, + "loss": 0.87919807, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01126087, + "step": 3663, + "time_per_iteration": 2.4897618293762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059718, + "balance_loss_mlp": 1.05026472, + "diversity_loss_mlp": 0.0, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.06585144557964606, + "language_loss": 0.868586, + "learning_rate": 0.0002115270607788397, + "loss": 0.87918323, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3664, + "time_per_iteration": 2.767237901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_mlp": 1.05233264, + "diversity_loss_mlp": 0.0, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.06809628156665722, + "language_loss": 0.8563199, + "learning_rate": 0.00021127265452403133, + "loss": 0.86693728, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3665, + "time_per_iteration": 2.5270590782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028622, + "balance_loss_mlp": 1.02266109, + "diversity_loss_mlp": 0.0, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.030216242564882093, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85120249, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3666, + "time_per_iteration": 4.850507974624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_mlp": 1.04785872, + "diversity_loss_mlp": 0.0, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.07688296901308685, + "language_loss": 0.82549417, + "learning_rate": 0.00021076417839483065, + "loss": 0.83607054, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3667, + "time_per_iteration": 2.789318799972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00785288, + "balance_loss_mlp": 1.32734215, + "diversity_loss_mlp": 0.21942863, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.027872662040783723, + "language_loss": 0.85229611, + "learning_rate": 0.00021051010871784589, + "loss": 0.86014903, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01190263, + "step": 3668, + "time_per_iteration": 2.6029293537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_mlp": 1.03972173, + "diversity_loss_mlp": 0.0, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.06094440535163373, + "language_loss": 0.79136097, + "learning_rate": 0.0002102561514308045, + "loss": 0.80185533, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3669, + "time_per_iteration": 2.717550754547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_mlp": 1.03882289, + "diversity_loss_mlp": 0.0, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.06685679205809081, + "language_loss": 0.82684934, + "learning_rate": 0.00021000230663230135, + "loss": 0.83733451, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3670, + "time_per_iteration": 2.663641929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_mlp": 1.03758621, + "diversity_loss_mlp": 0.0, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.0788999580683501, + "language_loss": 0.8333686, + "learning_rate": 0.00020974857442088762, + "loss": 0.84384131, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3671, + "time_per_iteration": 2.603200674057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_mlp": 1.04090595, + "diversity_loss_mlp": 0.0, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.06597055707746856, + "language_loss": 0.89200228, + "learning_rate": 0.00020949495489507104, + "loss": 0.90250599, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3672, + "time_per_iteration": 2.6877996921539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_mlp": 1.04270363, + "diversity_loss_mlp": 0.0, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.17274894008002345, + "language_loss": 0.84991109, + "learning_rate": 0.00020924144815331525, + "loss": 0.86043334, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3673, + "time_per_iteration": 2.5844242572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.04517114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.0640379080300773, + "language_loss": 0.83600396, + "learning_rate": 0.00020898805429404044, + "loss": 0.84655201, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3674, + "time_per_iteration": 2.676417350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_mlp": 1.04724169, + "diversity_loss_mlp": 0.0, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.0780577693768427, + "language_loss": 0.78793156, + "learning_rate": 0.0002087347734156228, + "loss": 0.79849994, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3675, + "time_per_iteration": 2.8697783946990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057522, + "balance_loss_mlp": 1.04800272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.0710988084964876, + "language_loss": 0.79834986, + "learning_rate": 0.00020848160561639452, + "loss": 0.80892509, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3676, + "time_per_iteration": 2.7413785457611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106069, + "balance_loss_mlp": 1.05147529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.06834186778178446, + "language_loss": 0.86040401, + "learning_rate": 0.0002082285509946445, + "loss": 0.8710109, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3677, + "time_per_iteration": 2.5471127033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063838, + "balance_loss_mlp": 1.05436051, + "diversity_loss_mlp": 0.0, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.06236421972787801, + "language_loss": 0.83409554, + "learning_rate": 0.00020797560964861683, + "loss": 0.84473389, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3678, + "time_per_iteration": 2.748696804046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.05635202, + "diversity_loss_mlp": 0.0, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.07878907365407993, + "language_loss": 0.80641901, + "learning_rate": 0.0002077227816765122, + "loss": 0.81707478, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3679, + "time_per_iteration": 3.000666618347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_mlp": 1.03114033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.025842314854182848, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77483988, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3680, + "time_per_iteration": 4.779016971588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106697, + "balance_loss_mlp": 1.05772507, + "diversity_loss_mlp": 0.0, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.06703239561102693, + "language_loss": 0.78754878, + "learning_rate": 0.00020721746624665383, + "loss": 0.79821849, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3681, + "time_per_iteration": 2.7041916847229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073317, + "balance_loss_mlp": 1.06381631, + "diversity_loss_mlp": 0.0, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.06071055961479113, + "language_loss": 0.80160034, + "learning_rate": 0.00020696497898508114, + "loss": 0.81233358, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3682, + "time_per_iteration": 3.003126382827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06374955, + "diversity_loss_mlp": 0.0, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.0794178936209596, + "language_loss": 0.77425051, + "learning_rate": 0.00020671260548979316, + "loss": 0.7849825, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3683, + "time_per_iteration": 3.000619649887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079652, + "balance_loss_mlp": 1.07019854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.06569012319146904, + "language_loss": 0.85012448, + "learning_rate": 0.00020646034585876982, + "loss": 0.86092097, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3684, + "time_per_iteration": 2.8407599925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788495, + "balance_loss_mlp": 1.33244729, + "diversity_loss_mlp": 0.22155851, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.02817752508262258, + "language_loss": 0.84630954, + "learning_rate": 0.00020620820018994718, + "loss": 0.8541944, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0114923, + "step": 3685, + "time_per_iteration": 2.8807289600372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791818, + "balance_loss_mlp": 1.33957911, + "diversity_loss_mlp": 0.22135019, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.03572846620936607, + "language_loss": 0.83307725, + "learning_rate": 0.00020595616858121675, + "loss": 0.84099543, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113536, + "step": 3686, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075035, + "balance_loss_mlp": 1.06569517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05825520117041851, + "language_loss": 0.80985916, + "learning_rate": 0.00020570425113042586, + "loss": 0.82060945, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3687, + "time_per_iteration": 2.724151611328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078198, + "balance_loss_mlp": 1.06894779, + "diversity_loss_mlp": 0.0, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.0736963808397267, + "language_loss": 0.8558749, + "learning_rate": 0.0002054524479353776, + "loss": 0.8666569, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3688, + "time_per_iteration": 2.7505970001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074288, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07506666957013575, + "language_loss": 0.81571054, + "learning_rate": 0.00020520075909383063, + "loss": 0.82645345, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3689, + "time_per_iteration": 2.854198694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.06511474, + "diversity_loss_mlp": 0.0, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.06551416788386397, + "language_loss": 0.80860078, + "learning_rate": 0.00020494918470349916, + "loss": 0.81934714, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3690, + "time_per_iteration": 3.2713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079528, + "balance_loss_mlp": 1.34716058, + "diversity_loss_mlp": 0.22097552, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.03587666052644611, + "language_loss": 0.85333264, + "learning_rate": 0.00020469772486205297, + "loss": 0.86128545, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01121199, + "step": 3691, + "time_per_iteration": 2.626685380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787595, + "balance_loss_mlp": 1.33183146, + "diversity_loss_mlp": 0.22060202, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.030476334667887343, + "language_loss": 0.81455922, + "learning_rate": 0.0002044463796671177, + "loss": 0.82243514, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113784, + "step": 3692, + "time_per_iteration": 2.7819416522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_mlp": 1.06465387, + "diversity_loss_mlp": 0.0, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.07963770038273417, + "language_loss": 0.8046093, + "learning_rate": 0.00020419514921627408, + "loss": 0.81534946, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3693, + "time_per_iteration": 2.8676981925964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069877, + "balance_loss_mlp": 1.06088233, + "diversity_loss_mlp": 0.0, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.07391756130926609, + "language_loss": 0.77261078, + "learning_rate": 0.00020394403360705855, + "loss": 0.78330958, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3694, + "time_per_iteration": 2.695068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788663, + "balance_loss_mlp": 1.33321095, + "diversity_loss_mlp": 0.22100018, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.034812211167962216, + "language_loss": 0.88271379, + "learning_rate": 0.00020369303293696228, + "loss": 0.89060044, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155703, + "step": 3695, + "time_per_iteration": 2.601621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066517, + "balance_loss_mlp": 1.05723643, + "diversity_loss_mlp": 0.0, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.07715335648803619, + "language_loss": 0.78224587, + "learning_rate": 0.00020344214730343304, + "loss": 0.79291105, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3696, + "time_per_iteration": 2.6193599700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_mlp": 1.05618572, + "diversity_loss_mlp": 0.0, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05468894944159508, + "language_loss": 0.79277122, + "learning_rate": 0.00020319137680387296, + "loss": 0.80342424, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3697, + "time_per_iteration": 2.9309933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060045, + "balance_loss_mlp": 1.05068743, + "diversity_loss_mlp": 0.0, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.07057759031394817, + "language_loss": 0.80451727, + "learning_rate": 0.0002029407215356398, + "loss": 0.81511772, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3698, + "time_per_iteration": 2.4956727027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058583, + "balance_loss_mlp": 1.04976714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.0722387573875999, + "language_loss": 0.83844793, + "learning_rate": 0.00020269018159604663, + "loss": 0.84903371, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3699, + "time_per_iteration": 2.731231689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057429, + "balance_loss_mlp": 1.04814827, + "diversity_loss_mlp": 0.0, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.07123396580800914, + "language_loss": 0.818003, + "learning_rate": 0.00020243975708236162, + "loss": 0.82857728, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3700, + "time_per_iteration": 2.597215414047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781944, + "balance_loss_mlp": 1.31673443, + "diversity_loss_mlp": 0.22274226, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.030217464674653638, + "language_loss": 0.86634398, + "learning_rate": 0.00020218944809180818, + "loss": 0.87416339, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220552, + "step": 3701, + "time_per_iteration": 2.7128944396972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056546, + "balance_loss_mlp": 1.04739642, + "diversity_loss_mlp": 0.0, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.06969302254489844, + "language_loss": 0.84630072, + "learning_rate": 0.00020193925472156493, + "loss": 0.85686618, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3702, + "time_per_iteration": 2.695040702819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009738, + "balance_loss_mlp": 1.00442076, + "diversity_loss_mlp": 0.0, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.015177951683804305, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75298905, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3703, + "time_per_iteration": 4.91239857673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784779, + "balance_loss_mlp": 1.3239193, + "diversity_loss_mlp": 0.22157452, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.02622509859947044, + "language_loss": 0.83696187, + "learning_rate": 0.00020143921523049863, + "loss": 0.84480959, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203172, + "step": 3704, + "time_per_iteration": 3.0262062549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057244, + "balance_loss_mlp": 1.04805851, + "diversity_loss_mlp": 0.0, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.07737525798134272, + "language_loss": 0.838422, + "learning_rate": 0.00020118936930380837, + "loss": 0.84899437, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3705, + "time_per_iteration": 2.741217851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105596, + "balance_loss_mlp": 1.04639971, + "diversity_loss_mlp": 0.0, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.08146435226617602, + "language_loss": 0.80879092, + "learning_rate": 0.0002009396393856932, + "loss": 0.81935048, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3706, + "time_per_iteration": 2.643540143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.0414114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.07418360122955521, + "language_loss": 0.82790005, + "learning_rate": 0.00020069002557310673, + "loss": 0.83840382, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3707, + "time_per_iteration": 2.719648838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052452, + "balance_loss_mlp": 1.04351699, + "diversity_loss_mlp": 0.0, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.05884856391484217, + "language_loss": 0.77115107, + "learning_rate": 0.00020044052796295807, + "loss": 0.78167558, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3708, + "time_per_iteration": 2.830353260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04202533, + "diversity_loss_mlp": 0.0, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.07889939453961878, + "language_loss": 0.82217181, + "learning_rate": 0.00020019114665211063, + "loss": 0.83268428, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3709, + "time_per_iteration": 2.581709623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048706, + "balance_loss_mlp": 1.03982449, + "diversity_loss_mlp": 0.0, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.06519405348344502, + "language_loss": 0.81405282, + "learning_rate": 0.00019994188173738276, + "loss": 0.8245399, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3710, + "time_per_iteration": 2.5735976696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049854, + "balance_loss_mlp": 1.04063272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.07046885330875076, + "language_loss": 0.80712581, + "learning_rate": 0.0001996927333155477, + "loss": 0.81762433, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3711, + "time_per_iteration": 2.814368724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_mlp": 1.04546654, + "diversity_loss_mlp": 0.0, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.07187972004168419, + "language_loss": 0.85349059, + "learning_rate": 0.00019944370148333346, + "loss": 0.8640365, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3712, + "time_per_iteration": 3.169759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058298, + "balance_loss_mlp": 1.04938745, + "diversity_loss_mlp": 0.0, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.060002667598624965, + "language_loss": 0.79623508, + "learning_rate": 0.00019919478633742278, + "loss": 0.80681807, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3713, + "time_per_iteration": 2.644663095474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061749, + "balance_loss_mlp": 1.05258763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.07397385813864758, + "language_loss": 0.85182703, + "learning_rate": 0.00019894598797445302, + "loss": 0.86244452, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3714, + "time_per_iteration": 2.5240604877471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061709, + "balance_loss_mlp": 1.05239308, + "diversity_loss_mlp": 0.0, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.07339492646897193, + "language_loss": 0.81885231, + "learning_rate": 0.00019869730649101615, + "loss": 0.82946944, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3715, + "time_per_iteration": 2.827868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063135, + "balance_loss_mlp": 1.05403948, + "diversity_loss_mlp": 0.0, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.0742719443850205, + "language_loss": 0.72613627, + "learning_rate": 0.00019844874198365943, + "loss": 0.73676765, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3716, + "time_per_iteration": 3.0963878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.05428362, + "diversity_loss_mlp": 0.0, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061591749317610134, + "language_loss": 0.83976817, + "learning_rate": 0.00019820029454888362, + "loss": 0.85040331, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3717, + "time_per_iteration": 2.7068889141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006732, + "balance_loss_mlp": 1.0012722, + "diversity_loss_mlp": 0.0, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.016486733546314403, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75528002, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 3718, + "time_per_iteration": 5.0301513671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010681, + "balance_loss_mlp": 1.05873013, + "diversity_loss_mlp": 0.0, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06632920905024949, + "language_loss": 0.80107152, + "learning_rate": 0.0001977037512828529, + "loss": 0.81175244, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3719, + "time_per_iteration": 2.573982000350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066134, + "balance_loss_mlp": 1.05686522, + "diversity_loss_mlp": 0.0, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.05986593090344285, + "language_loss": 0.86432415, + "learning_rate": 0.0001974556556443734, + "loss": 0.87498546, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3720, + "time_per_iteration": 2.7087209224700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106825, + "balance_loss_mlp": 1.0589757, + "diversity_loss_mlp": 0.0, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05551674827732864, + "language_loss": 0.88590324, + "learning_rate": 0.00019720767746402547, + "loss": 0.89658576, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3721, + "time_per_iteration": 2.7290821075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010754, + "balance_loss_mlp": 1.06610191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.07406216566818759, + "language_loss": 0.79965603, + "learning_rate": 0.00019695981683808222, + "loss": 0.81041002, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3722, + "time_per_iteration": 2.8323793411254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072903, + "balance_loss_mlp": 1.06386733, + "diversity_loss_mlp": 0.0, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.08922707402242334, + "language_loss": 0.84955275, + "learning_rate": 0.00019671207386277225, + "loss": 0.86028177, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3723, + "time_per_iteration": 2.94681978225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069475, + "balance_loss_mlp": 1.06010544, + "diversity_loss_mlp": 0.0, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.07420263460977167, + "language_loss": 0.78355432, + "learning_rate": 0.0001964644486342777, + "loss": 0.79424912, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3724, + "time_per_iteration": 2.960944414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064733, + "balance_loss_mlp": 1.05573297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.0760825236490028, + "language_loss": 0.86588323, + "learning_rate": 0.00019621694124873524, + "loss": 0.87653053, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3725, + "time_per_iteration": 2.6881937980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101766, + "balance_loss_mlp": 1.01224804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.018433056607108506, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77557743, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3726, + "time_per_iteration": 4.8842387199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_mlp": 1.04820442, + "diversity_loss_mlp": 0.0, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.08148717312552407, + "language_loss": 0.77167314, + "learning_rate": 0.00019572228039082428, + "loss": 0.78224969, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3727, + "time_per_iteration": 3.071643829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055911, + "balance_loss_mlp": 1.04670763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05270267691232831, + "language_loss": 0.83482945, + "learning_rate": 0.0001954751271105002, + "loss": 0.84538865, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3728, + "time_per_iteration": 2.8301711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_mlp": 1.04496169, + "diversity_loss_mlp": 0.0, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.06896440922655821, + "language_loss": 0.80838037, + "learning_rate": 0.00019522809205721687, + "loss": 0.81892335, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3729, + "time_per_iteration": 2.8094747066497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048553, + "balance_loss_mlp": 1.03930831, + "diversity_loss_mlp": 0.0, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.09744205035272979, + "language_loss": 0.83110106, + "learning_rate": 0.0001949811753268816, + "loss": 0.84158659, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3730, + "time_per_iteration": 2.6963374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045755, + "balance_loss_mlp": 1.03643274, + "diversity_loss_mlp": 0.0, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.0730125544637403, + "language_loss": 0.82630277, + "learning_rate": 0.00019473437701535634, + "loss": 0.83676028, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3731, + "time_per_iteration": 2.6076574325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_mlp": 1.03844213, + "diversity_loss_mlp": 0.0, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.07914181118847867, + "language_loss": 0.89615285, + "learning_rate": 0.00019448769721845677, + "loss": 0.90662855, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3732, + "time_per_iteration": 2.824897289276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_mlp": 1.03853655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.07061643018013358, + "language_loss": 0.86148334, + "learning_rate": 0.00019424113603195203, + "loss": 0.87196326, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3733, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_mlp": 1.03879809, + "diversity_loss_mlp": 0.0, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.07087799527916698, + "language_loss": 0.79863775, + "learning_rate": 0.0001939946935515657, + "loss": 0.80912238, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3734, + "time_per_iteration": 2.8286993503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_mlp": 1.03927684, + "diversity_loss_mlp": 0.0, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.08245280249652003, + "language_loss": 0.80650169, + "learning_rate": 0.0001937483698729755, + "loss": 0.8169921, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3735, + "time_per_iteration": 2.6458795070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_mlp": 1.0338974, + "diversity_loss_mlp": 0.0, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.07515481344769812, + "language_loss": 0.82211673, + "learning_rate": 0.0001935021650918128, + "loss": 0.83255374, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3736, + "time_per_iteration": 3.0285887718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_mlp": 1.03346682, + "diversity_loss_mlp": 0.0, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06979349456564556, + "language_loss": 0.87017608, + "learning_rate": 0.0001932560793036625, + "loss": 0.88060999, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3737, + "time_per_iteration": 2.482374906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_mlp": 1.03452408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.08340257337042449, + "language_loss": 0.86882925, + "learning_rate": 0.00019301011260406382, + "loss": 0.87927186, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3738, + "time_per_iteration": 2.6162045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_mlp": 1.03576994, + "diversity_loss_mlp": 0.0, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.0721539169034284, + "language_loss": 0.79805303, + "learning_rate": 0.00019276426508850936, + "loss": 0.80850697, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3739, + "time_per_iteration": 2.7380456924438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_mlp": 1.03111315, + "diversity_loss_mlp": 0.0, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.0788007665709812, + "language_loss": 0.80469853, + "learning_rate": 0.00019251853685244564, + "loss": 0.81510872, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3740, + "time_per_iteration": 3.0559754371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_mlp": 1.03485012, + "diversity_loss_mlp": 0.0, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.07989753754857366, + "language_loss": 0.80738026, + "learning_rate": 0.00019227292799127283, + "loss": 0.81782538, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3741, + "time_per_iteration": 3.0058369636535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_mlp": 1.03530192, + "diversity_loss_mlp": 0.0, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.17846470971826942, + "language_loss": 0.79000109, + "learning_rate": 0.00019202743860034454, + "loss": 0.80044937, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3742, + "time_per_iteration": 3.218614339828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_mlp": 1.03441513, + "diversity_loss_mlp": 0.0, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.07729553507192725, + "language_loss": 0.83831203, + "learning_rate": 0.00019178206877496873, + "loss": 0.84874886, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3743, + "time_per_iteration": 2.7014403343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048278, + "balance_loss_mlp": 1.03885424, + "diversity_loss_mlp": 0.0, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.06342209640567653, + "language_loss": 0.85333169, + "learning_rate": 0.0001915368186104059, + "loss": 0.86381447, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3744, + "time_per_iteration": 2.733520746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105441, + "balance_loss_mlp": 1.04513526, + "diversity_loss_mlp": 0.0, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.08207076889899251, + "language_loss": 0.81176144, + "learning_rate": 0.0001912916882018706, + "loss": 0.8223055, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3745, + "time_per_iteration": 2.7833125591278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057126, + "balance_loss_mlp": 1.04774427, + "diversity_loss_mlp": 0.0, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.08263651010752651, + "language_loss": 0.79468751, + "learning_rate": 0.00019104667764453125, + "loss": 0.80525875, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3746, + "time_per_iteration": 3.0572047233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066676, + "balance_loss_mlp": 1.05751503, + "diversity_loss_mlp": 0.0, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.06554660744507769, + "language_loss": 0.80441052, + "learning_rate": 0.00019080178703350926, + "loss": 0.8150773, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3747, + "time_per_iteration": 2.6344070434570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_mlp": 1.05819249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.07164749029527417, + "language_loss": 0.83225226, + "learning_rate": 0.00019055701646387952, + "loss": 0.84292996, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3748, + "time_per_iteration": 2.674436330795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014621, + "balance_loss_mlp": 1.00935245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.01350364958452467, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.8148731, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3749, + "time_per_iteration": 4.8169167041778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074721, + "balance_loss_mlp": 1.06568444, + "diversity_loss_mlp": 0.0, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.09948968640859872, + "language_loss": 0.86443639, + "learning_rate": 0.00019006783582886368, + "loss": 0.87518358, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3750, + "time_per_iteration": 2.6094882488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082564, + "balance_loss_mlp": 1.0731287, + "diversity_loss_mlp": 0.0, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.0940617497046545, + "language_loss": 0.8313877, + "learning_rate": 0.00018982342595339437, + "loss": 0.84221339, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3751, + "time_per_iteration": 4.834062576293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077441, + "balance_loss_mlp": 1.06848848, + "diversity_loss_mlp": 0.0, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.08300933032368943, + "language_loss": 0.81837034, + "learning_rate": 0.00018957913649915076, + "loss": 0.82914484, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3752, + "time_per_iteration": 3.1204826831817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076559, + "balance_loss_mlp": 1.06739748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.08305681898579634, + "language_loss": 0.79633486, + "learning_rate": 0.00018933496756097428, + "loss": 0.80710053, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3753, + "time_per_iteration": 2.6664350032806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077149, + "balance_loss_mlp": 1.06786871, + "diversity_loss_mlp": 0.0, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.08328010196337048, + "language_loss": 0.81679463, + "learning_rate": 0.0001890909192336603, + "loss": 0.82756615, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3754, + "time_per_iteration": 2.994882822036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_mlp": 1.06407857, + "diversity_loss_mlp": 0.0, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.08777822688547723, + "language_loss": 0.70716894, + "learning_rate": 0.00018884699161195623, + "loss": 0.71790028, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3755, + "time_per_iteration": 4.262615442276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_mlp": 1.06174874, + "diversity_loss_mlp": 0.0, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.0673256778775424, + "language_loss": 0.77517748, + "learning_rate": 0.00018860318479056327, + "loss": 0.78588951, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3756, + "time_per_iteration": 3.1185147762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.05514276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.06734169026400741, + "language_loss": 0.83406973, + "learning_rate": 0.00018835949886413555, + "loss": 0.84471071, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3757, + "time_per_iteration": 2.7693490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.05735517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.0750419048722912, + "language_loss": 0.78459024, + "learning_rate": 0.0001881159339272806, + "loss": 0.79525727, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3758, + "time_per_iteration": 2.6415517330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_mlp": 1.05062032, + "diversity_loss_mlp": 0.0, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.0644798827635335, + "language_loss": 0.78601432, + "learning_rate": 0.00018787249007455858, + "loss": 0.79661226, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3759, + "time_per_iteration": 2.6022799015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_mlp": 1.05413401, + "diversity_loss_mlp": 0.0, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.07015599197769962, + "language_loss": 0.71291095, + "learning_rate": 0.00018762916740048302, + "loss": 0.72354335, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3760, + "time_per_iteration": 2.8239991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059773, + "balance_loss_mlp": 1.05033171, + "diversity_loss_mlp": 0.0, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.07068719643677601, + "language_loss": 0.86275655, + "learning_rate": 0.0001873859659995195, + "loss": 0.87335426, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3761, + "time_per_iteration": 2.825853109359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056903, + "balance_loss_mlp": 1.04742599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.06521234046982781, + "language_loss": 0.83369851, + "learning_rate": 0.0001871428859660878, + "loss": 0.84426749, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3762, + "time_per_iteration": 2.765061855316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054846, + "balance_loss_mlp": 1.04584002, + "diversity_loss_mlp": 0.0, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.06876344834189922, + "language_loss": 0.81910485, + "learning_rate": 0.00018689992739455975, + "loss": 0.82965332, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3763, + "time_per_iteration": 2.955744504928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_mlp": 1.04123139, + "diversity_loss_mlp": 0.0, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.06967924844938471, + "language_loss": 0.85903621, + "learning_rate": 0.00018665709037926027, + "loss": 0.86954343, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3764, + "time_per_iteration": 3.306689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_mlp": 1.04077554, + "diversity_loss_mlp": 0.0, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.07823184864923875, + "language_loss": 0.8509047, + "learning_rate": 0.00018641437501446694, + "loss": 0.86140537, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3765, + "time_per_iteration": 2.5606436729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.04385924, + "diversity_loss_mlp": 0.0, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.07453327039799393, + "language_loss": 0.8240428, + "learning_rate": 0.0001861717813944104, + "loss": 0.83457536, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3766, + "time_per_iteration": 2.639479875564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052028, + "balance_loss_mlp": 1.04260468, + "diversity_loss_mlp": 0.0, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.07462880824505752, + "language_loss": 0.79635704, + "learning_rate": 0.00018592930961327365, + "loss": 0.80687737, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3767, + "time_per_iteration": 2.71537446975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051032, + "balance_loss_mlp": 1.04159653, + "diversity_loss_mlp": 0.0, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.06502387009338012, + "language_loss": 0.88172042, + "learning_rate": 0.00018568695976519273, + "loss": 0.89223075, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3768, + "time_per_iteration": 2.7851336002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053572, + "balance_loss_mlp": 1.04388046, + "diversity_loss_mlp": 0.0, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.07526480217284313, + "language_loss": 0.80197144, + "learning_rate": 0.00018544473194425593, + "loss": 0.81250715, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3769, + "time_per_iteration": 2.5187532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054045, + "balance_loss_mlp": 1.044276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.07238275679239237, + "language_loss": 0.78824592, + "learning_rate": 0.00018520262624450485, + "loss": 0.79878634, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3770, + "time_per_iteration": 2.8748114109039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_mlp": 1.04787064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.08918095477851212, + "language_loss": 0.86894727, + "learning_rate": 0.00018496064275993324, + "loss": 0.87952113, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3771, + "time_per_iteration": 2.824845314025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105481, + "balance_loss_mlp": 1.04509437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06900224223805673, + "language_loss": 0.82001221, + "learning_rate": 0.00018471878158448686, + "loss": 0.83056033, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3772, + "time_per_iteration": 2.9548990726470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056235, + "balance_loss_mlp": 1.04668033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.058256019250052936, + "language_loss": 0.84301949, + "learning_rate": 0.00018447704281206512, + "loss": 0.85358179, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3773, + "time_per_iteration": 2.83591365814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055601, + "balance_loss_mlp": 1.04598725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.07576068763334884, + "language_loss": 0.82763028, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818638, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3774, + "time_per_iteration": 2.68778657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060495, + "balance_loss_mlp": 1.05112517, + "diversity_loss_mlp": 0.0, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.0805275617178238, + "language_loss": 0.80610001, + "learning_rate": 0.0001839939328516526, + "loss": 0.81670493, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3775, + "time_per_iteration": 2.7422258853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790959, + "balance_loss_mlp": 1.33957541, + "diversity_loss_mlp": 0.21958014, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.033705672182060005, + "language_loss": 0.8138454, + "learning_rate": 0.0001837525618512218, + "loss": 0.82175499, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138153, + "step": 3776, + "time_per_iteration": 2.9108829498291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04409015, + "diversity_loss_mlp": 0.0, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.07511121424148261, + "language_loss": 0.8321476, + "learning_rate": 0.00018351131362893519, + "loss": 0.84268057, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3777, + "time_per_iteration": 2.789809465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058309, + "balance_loss_mlp": 1.04874849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.08246656435114352, + "language_loss": 0.80534494, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592798, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3778, + "time_per_iteration": 2.6201207637786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_mlp": 1.0502367, + "diversity_loss_mlp": 0.0, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.060849425034284504, + "language_loss": 0.87504601, + "learning_rate": 0.00018302918589339036, + "loss": 0.88564098, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3779, + "time_per_iteration": 2.689378499984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061153, + "balance_loss_mlp": 1.05198562, + "diversity_loss_mlp": 0.0, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06743911417724738, + "language_loss": 0.90138805, + "learning_rate": 0.00018278830656731054, + "loss": 0.91199952, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3780, + "time_per_iteration": 2.6595706939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056667, + "balance_loss_mlp": 1.04758894, + "diversity_loss_mlp": 0.0, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.06124301945992682, + "language_loss": 0.86350238, + "learning_rate": 0.00018254755039373222, + "loss": 0.87406909, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3781, + "time_per_iteration": 2.7230565547943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.0530144, + "diversity_loss_mlp": 0.0, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.07105415138975459, + "language_loss": 0.83752382, + "learning_rate": 0.0001823069174661252, + "loss": 0.84814572, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3782, + "time_per_iteration": 2.7941086292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056585, + "balance_loss_mlp": 1.04759097, + "diversity_loss_mlp": 0.0, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.06458866746308467, + "language_loss": 0.78171599, + "learning_rate": 0.00018206640787791112, + "loss": 0.79228187, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3783, + "time_per_iteration": 2.618022918701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062955, + "balance_loss_mlp": 1.05387712, + "diversity_loss_mlp": 0.0, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.06663972838638854, + "language_loss": 0.85480422, + "learning_rate": 0.00018182602172246416, + "loss": 0.86543375, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3784, + "time_per_iteration": 2.6113829612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.05812776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.07678107880467737, + "language_loss": 0.76375031, + "learning_rate": 0.00018158575909311075, + "loss": 0.77441949, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3785, + "time_per_iteration": 2.650192975997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_mlp": 1.05243719, + "diversity_loss_mlp": 0.0, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.07604258502871962, + "language_loss": 0.79732937, + "learning_rate": 0.000181345620083129, + "loss": 0.80794436, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3786, + "time_per_iteration": 2.8074841499328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061193, + "balance_loss_mlp": 1.05211556, + "diversity_loss_mlp": 0.0, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.0629164713746694, + "language_loss": 0.86736983, + "learning_rate": 0.00018110560478574927, + "loss": 0.87798178, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3787, + "time_per_iteration": 2.6831634044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.05288577, + "diversity_loss_mlp": 0.0, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.07652228362928638, + "language_loss": 0.80521822, + "learning_rate": 0.0001808657132941533, + "loss": 0.81583983, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3788, + "time_per_iteration": 2.7681210041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063738, + "balance_loss_mlp": 1.05462408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06755228065084157, + "language_loss": 0.83012414, + "learning_rate": 0.00018062594570147572, + "loss": 0.84076142, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3789, + "time_per_iteration": 2.59897780418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069496, + "balance_loss_mlp": 1.06051326, + "diversity_loss_mlp": 0.0, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.0602370632110868, + "language_loss": 0.84944886, + "learning_rate": 0.00018038630210080243, + "loss": 0.86014384, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3790, + "time_per_iteration": 2.8492085933685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_mlp": 1.05299687, + "diversity_loss_mlp": 0.0, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.06258751029355039, + "language_loss": 0.85112703, + "learning_rate": 0.0001801467825851712, + "loss": 0.86174691, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3791, + "time_per_iteration": 2.724008321762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063231, + "balance_loss_mlp": 1.05412316, + "diversity_loss_mlp": 0.0, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.06759881980366181, + "language_loss": 0.78407717, + "learning_rate": 0.00017990738724757172, + "loss": 0.79470944, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3792, + "time_per_iteration": 2.8527557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.05635726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05706424828537789, + "language_loss": 0.82412189, + "learning_rate": 0.00017966811618094598, + "loss": 0.83477581, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3793, + "time_per_iteration": 2.891587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071379, + "balance_loss_mlp": 1.06256318, + "diversity_loss_mlp": 0.0, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.0800044571001495, + "language_loss": 0.84934509, + "learning_rate": 0.00017942896947818664, + "loss": 0.86005884, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3794, + "time_per_iteration": 2.578213691711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_mlp": 1.02208936, + "diversity_loss_mlp": 0.0, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.018812365315957286, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7585234, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.05200195, + "routerloss_mlp": 0.0, + "step": 3795, + "time_per_iteration": 4.8731958866119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05696881, + "diversity_loss_mlp": 0.0, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.08247331408198653, + "language_loss": 0.85473979, + "learning_rate": 0.00017895104953559947, + "loss": 0.86539787, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3796, + "time_per_iteration": 2.6150035858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071519, + "balance_loss_mlp": 1.06257856, + "diversity_loss_mlp": 0.0, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.0876682306683089, + "language_loss": 0.90019357, + "learning_rate": 0.00017871227648131672, + "loss": 0.91090876, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3797, + "time_per_iteration": 2.5456666946411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790219, + "balance_loss_mlp": 1.33552265, + "diversity_loss_mlp": 0.2213349, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.0295011086457174, + "language_loss": 0.82969385, + "learning_rate": 0.0001784736281619907, + "loss": 0.83759606, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01178985, + "step": 3798, + "time_per_iteration": 2.617690086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064248, + "balance_loss_mlp": 1.05507529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.0761333988969544, + "language_loss": 0.74143457, + "learning_rate": 0.00017823510467027232, + "loss": 0.75207704, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3799, + "time_per_iteration": 2.74944806098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061269, + "balance_loss_mlp": 1.05231094, + "diversity_loss_mlp": 0.0, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.07529945885516458, + "language_loss": 0.7849319, + "learning_rate": 0.00017799670609876516, + "loss": 0.79554456, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3800, + "time_per_iteration": 2.514719247817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106109, + "balance_loss_mlp": 1.05228066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.07202410794231434, + "language_loss": 0.89223945, + "learning_rate": 0.00017775843254002366, + "loss": 0.90285027, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3801, + "time_per_iteration": 2.742403507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059589, + "balance_loss_mlp": 1.05084491, + "diversity_loss_mlp": 0.0, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.060424645606399964, + "language_loss": 0.83728462, + "learning_rate": 0.00017752028408655367, + "loss": 0.84788048, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3802, + "time_per_iteration": 3.0845768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.007903, + "balance_loss_mlp": 1.33712423, + "diversity_loss_mlp": 0.22043222, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.03351149815402085, + "language_loss": 0.85395515, + "learning_rate": 0.00017728226083081272, + "loss": 0.86185813, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01152179, + "step": 3803, + "time_per_iteration": 2.625450849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.05536509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06980647435682294, + "language_loss": 0.81371546, + "learning_rate": 0.00017704436286520965, + "loss": 0.82435715, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3804, + "time_per_iteration": 2.5445075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064875, + "balance_loss_mlp": 1.05574334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.0710476755005787, + "language_loss": 0.84313726, + "learning_rate": 0.0001768065902821046, + "loss": 0.85378599, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3805, + "time_per_iteration": 2.6542673110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060899, + "balance_loss_mlp": 1.05200648, + "diversity_loss_mlp": 0.0, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.07797130890244271, + "language_loss": 0.8206104, + "learning_rate": 0.00017656894317380907, + "loss": 0.83121943, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3806, + "time_per_iteration": 2.701544761657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020369, + "balance_loss_mlp": 1.01498067, + "diversity_loss_mlp": 0.0, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.021367923460696967, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.7705164, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 3807, + "time_per_iteration": 5.001535177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066451, + "balance_loss_mlp": 1.05737972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.08165775614059534, + "language_loss": 0.83709639, + "learning_rate": 0.00017609402575064875, + "loss": 0.84776092, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 3808, + "time_per_iteration": 2.583564043045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061217, + "balance_loss_mlp": 1.05229425, + "diversity_loss_mlp": 0.0, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.0811056502064105, + "language_loss": 0.80930746, + "learning_rate": 0.00017585675562016367, + "loss": 0.81991959, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 3809, + "time_per_iteration": 2.6347053050994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101136, + "balance_loss_mlp": 1.00604343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.015405005389362274, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224206, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3810, + "time_per_iteration": 4.809669017791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.05418134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.08174261034044085, + "language_loss": 0.85100114, + "learning_rate": 0.00017538259298196474, + "loss": 0.86163306, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 3811, + "time_per_iteration": 2.5669541358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_mlp": 1.05802464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06518192792765873, + "language_loss": 0.82332867, + "learning_rate": 0.00017514570065833745, + "loss": 0.83399785, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3812, + "time_per_iteration": 2.7447328567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071687, + "balance_loss_mlp": 1.06259131, + "diversity_loss_mlp": 0.0, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09580264059121266, + "language_loss": 0.80788046, + "learning_rate": 0.00017490893445433426, + "loss": 0.81859732, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3813, + "time_per_iteration": 2.6378085613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.05522716, + "diversity_loss_mlp": 0.0, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.07102449829418327, + "language_loss": 0.81571025, + "learning_rate": 0.00017467229446187587, + "loss": 0.82635403, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3814, + "time_per_iteration": 2.7120914459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072677, + "balance_loss_mlp": 1.06393909, + "diversity_loss_mlp": 0.0, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.07114012207935533, + "language_loss": 0.81285048, + "learning_rate": 0.00017443578077283424, + "loss": 0.82357717, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 3815, + "time_per_iteration": 2.6395435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_mlp": 1.05747199, + "diversity_loss_mlp": 0.0, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.07483834875110257, + "language_loss": 0.84961641, + "learning_rate": 0.0001741993934790319, + "loss": 0.86028135, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3816, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.05116272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07480496039033006, + "language_loss": 0.84648383, + "learning_rate": 0.00017396313267224273, + "loss": 0.85708326, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 3817, + "time_per_iteration": 2.8066418170928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066831, + "balance_loss_mlp": 1.05799198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.0889487029403391, + "language_loss": 0.8847158, + "learning_rate": 0.0001737269984441912, + "loss": 0.89538407, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3818, + "time_per_iteration": 2.6318438053131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060197, + "balance_loss_mlp": 1.05124998, + "diversity_loss_mlp": 0.0, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.07556044268941689, + "language_loss": 0.85168499, + "learning_rate": 0.00017349099088655263, + "loss": 0.86228693, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3819, + "time_per_iteration": 2.6988065242767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058709, + "balance_loss_mlp": 1.05007255, + "diversity_loss_mlp": 0.0, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.06839680418094873, + "language_loss": 0.80908042, + "learning_rate": 0.00017325511009095375, + "loss": 0.81966752, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 3820, + "time_per_iteration": 2.727027177810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_mlp": 1.04837942, + "diversity_loss_mlp": 0.0, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.07744320065165705, + "language_loss": 0.83646286, + "learning_rate": 0.00017301935614897113, + "loss": 0.84703583, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3821, + "time_per_iteration": 2.6904449462890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059614, + "balance_loss_mlp": 1.05071497, + "diversity_loss_mlp": 0.0, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.06367960554180149, + "language_loss": 0.82050133, + "learning_rate": 0.00017278372915213274, + "loss": 0.83109748, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3822, + "time_per_iteration": 2.715162515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009526, + "balance_loss_mlp": 1.00437641, + "diversity_loss_mlp": 0.0, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.013680325571624621, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80903369, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3823, + "time_per_iteration": 4.962257146835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056616, + "balance_loss_mlp": 1.04753208, + "diversity_loss_mlp": 0.0, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.08246165896918017, + "language_loss": 0.80686677, + "learning_rate": 0.00017231285635975314, + "loss": 0.81743288, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3824, + "time_per_iteration": 2.892613172531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060803, + "balance_loss_mlp": 1.05131412, + "diversity_loss_mlp": 0.0, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.06805025721620432, + "language_loss": 0.83387762, + "learning_rate": 0.00017207761074702115, + "loss": 0.84448564, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3825, + "time_per_iteration": 2.600008964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061591, + "balance_loss_mlp": 1.05259085, + "diversity_loss_mlp": 0.0, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.06050130894095604, + "language_loss": 0.84002912, + "learning_rate": 0.0001718424924450514, + "loss": 0.85064507, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3826, + "time_per_iteration": 2.5992300510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054783, + "balance_loss_mlp": 1.04562807, + "diversity_loss_mlp": 0.0, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.057066515344493245, + "language_loss": 0.86262274, + "learning_rate": 0.00017160750154512482, + "loss": 0.87317061, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3827, + "time_per_iteration": 2.726304292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795034, + "balance_loss_mlp": 1.34579134, + "diversity_loss_mlp": 0.220893, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.03015959834370855, + "language_loss": 0.83901906, + "learning_rate": 0.0001713726381384731, + "loss": 0.84696937, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169185, + "step": 3828, + "time_per_iteration": 2.8043603897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061645, + "balance_loss_mlp": 1.05248344, + "diversity_loss_mlp": 0.0, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06844777280948466, + "language_loss": 0.81076348, + "learning_rate": 0.00017113790231627812, + "loss": 0.8213799, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3829, + "time_per_iteration": 2.619093179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100728, + "balance_loss_mlp": 1.0020107, + "diversity_loss_mlp": 0.0, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.01400462839453399, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80265498, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3830, + "time_per_iteration": 4.812221527099609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792371, + "balance_loss_mlp": 1.34191561, + "diversity_loss_mlp": 0.21972378, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.03330075510268521, + "language_loss": 0.81812584, + "learning_rate": 0.00017066881378973936, + "loss": 0.82604957, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155161, + "step": 3831, + "time_per_iteration": 2.7056965827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060657, + "balance_loss_mlp": 1.05176377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.07192956817041389, + "language_loss": 0.83134949, + "learning_rate": 0.00017043446126751189, + "loss": 0.84195602, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3832, + "time_per_iteration": 2.676421880722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060842, + "balance_loss_mlp": 1.05175185, + "diversity_loss_mlp": 0.0, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.07065913186643534, + "language_loss": 0.76922351, + "learning_rate": 0.00017020023669397376, + "loss": 0.77983195, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3833, + "time_per_iteration": 2.67942214012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063044, + "balance_loss_mlp": 1.0536567, + "diversity_loss_mlp": 0.0, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.07582868630536281, + "language_loss": 0.81676751, + "learning_rate": 0.0001699661401600589, + "loss": 0.82739794, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3834, + "time_per_iteration": 2.5813028812408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791828, + "balance_loss_mlp": 1.34016216, + "diversity_loss_mlp": 0.22067872, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.03104422851251126, + "language_loss": 0.78392982, + "learning_rate": 0.00016973217175665205, + "loss": 0.79184818, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01140742, + "step": 3835, + "time_per_iteration": 2.622943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_mlp": 0.99702322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.013207371532760371, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82168412, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3836, + "time_per_iteration": 4.931336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_mlp": 1.05126452, + "diversity_loss_mlp": 0.0, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.06649751574670516, + "language_loss": 0.84498501, + "learning_rate": 0.00016926461970465047, + "loss": 0.85558796, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 3837, + "time_per_iteration": 2.765747547149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_mlp": 1.04992294, + "diversity_loss_mlp": 0.0, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.0574260047104924, + "language_loss": 0.84358233, + "learning_rate": 0.00016903103623757516, + "loss": 0.85417342, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3838, + "time_per_iteration": 3.069658041000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060146, + "balance_loss_mlp": 1.05106258, + "diversity_loss_mlp": 0.0, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.19052913382225448, + "language_loss": 0.80133057, + "learning_rate": 0.00016879758126404738, + "loss": 0.81193197, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3839, + "time_per_iteration": 2.689941167831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789085, + "balance_loss_mlp": 1.33350182, + "diversity_loss_mlp": 0.2223025, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.03551016649676842, + "language_loss": 0.79851139, + "learning_rate": 0.00016856425487470216, + "loss": 0.80640227, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01118332, + "step": 3840, + "time_per_iteration": 3.1254615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064553, + "balance_loss_mlp": 1.05543303, + "diversity_loss_mlp": 0.0, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.0706997471436485, + "language_loss": 0.79199183, + "learning_rate": 0.00016833105716012486, + "loss": 0.8026374, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3841, + "time_per_iteration": 3.138193368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063082, + "balance_loss_mlp": 1.05398655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.06630465632536123, + "language_loss": 0.85135829, + "learning_rate": 0.00016809798821085088, + "loss": 0.86198914, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3842, + "time_per_iteration": 3.0023772716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.06117415, + "diversity_loss_mlp": 0.0, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.05652902477854722, + "language_loss": 0.89046443, + "learning_rate": 0.00016786504811736565, + "loss": 0.90116704, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3843, + "time_per_iteration": 2.706385374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063666, + "balance_loss_mlp": 1.05483222, + "diversity_loss_mlp": 0.0, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.0599118075718357, + "language_loss": 0.82577473, + "learning_rate": 0.00016763223697010442, + "loss": 0.83641136, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3844, + "time_per_iteration": 3.0668578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065987, + "balance_loss_mlp": 1.05714738, + "diversity_loss_mlp": 0.0, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.06587022409921209, + "language_loss": 0.84292293, + "learning_rate": 0.00016739955485945256, + "loss": 0.8535828, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3845, + "time_per_iteration": 2.76232647895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.05776656, + "diversity_loss_mlp": 0.0, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07863227392455628, + "language_loss": 0.85949242, + "learning_rate": 0.00016716700187574513, + "loss": 0.87015998, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3846, + "time_per_iteration": 2.6615161895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068464, + "balance_loss_mlp": 1.05967295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.0694717633397352, + "language_loss": 0.8384943, + "learning_rate": 0.0001669345781092675, + "loss": 0.84917903, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3847, + "time_per_iteration": 2.708287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068988, + "balance_loss_mlp": 1.06022048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.08739626570818541, + "language_loss": 0.87128854, + "learning_rate": 0.0001667022836502546, + "loss": 0.88197839, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 3848, + "time_per_iteration": 2.768453598022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_mlp": 1.06293964, + "diversity_loss_mlp": 0.0, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.07849103844245357, + "language_loss": 0.83004302, + "learning_rate": 0.00016647011858889077, + "loss": 0.84076011, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3849, + "time_per_iteration": 2.553321123123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_mlp": 1.05774295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.0747699795491948, + "language_loss": 0.85671914, + "learning_rate": 0.00016623808301531056, + "loss": 0.86738473, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3850, + "time_per_iteration": 2.6675972938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.06376278, + "diversity_loss_mlp": 0.0, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.08247164679043814, + "language_loss": 0.79259217, + "learning_rate": 0.00016600617701959842, + "loss": 0.8033188, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3851, + "time_per_iteration": 2.7360141277313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_mlp": 1.03028595, + "diversity_loss_mlp": 0.0, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.02428572869696352, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79879034, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.04931641, + "routerloss_mlp": 0.0, + "step": 3852, + "time_per_iteration": 4.992321968078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_mlp": 1.05746007, + "diversity_loss_mlp": 0.0, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.06380286775900439, + "language_loss": 0.81274605, + "learning_rate": 0.00016554275412186315, + "loss": 0.8234092, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 3853, + "time_per_iteration": 2.82212495803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065457, + "balance_loss_mlp": 1.05660534, + "diversity_loss_mlp": 0.0, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.08235676445627264, + "language_loss": 0.80846745, + "learning_rate": 0.0001653112373997568, + "loss": 0.81912202, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3854, + "time_per_iteration": 2.6886162757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.06417763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.0787808176004402, + "language_loss": 0.7459085, + "learning_rate": 0.0001650798506153517, + "loss": 0.75663662, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 3855, + "time_per_iteration": 2.699655294418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064176, + "balance_loss_mlp": 1.05534911, + "diversity_loss_mlp": 0.0, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.13185112675918914, + "language_loss": 0.84102911, + "learning_rate": 0.00016484859385848023, + "loss": 0.85167086, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3856, + "time_per_iteration": 2.6237292289733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.05749846, + "diversity_loss_mlp": 0.0, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.0735312090287519, + "language_loss": 0.77380371, + "learning_rate": 0.0001646174672189243, + "loss": 0.7844646, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 3857, + "time_per_iteration": 2.662250518798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066781, + "balance_loss_mlp": 1.05808437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.07158580991852644, + "language_loss": 0.80202585, + "learning_rate": 0.00016438647078641488, + "loss": 0.81269372, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 3858, + "time_per_iteration": 2.5815234184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_mlp": 1.05223656, + "diversity_loss_mlp": 0.0, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.07922307514532904, + "language_loss": 0.82879561, + "learning_rate": 0.00016415560465063344, + "loss": 0.83940804, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3859, + "time_per_iteration": 2.708585739135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_mlp": 1.04814172, + "diversity_loss_mlp": 0.0, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07844823875052143, + "language_loss": 0.79364371, + "learning_rate": 0.0001639248689012095, + "loss": 0.80421484, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3860, + "time_per_iteration": 2.58583927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05484664, + "diversity_loss_mlp": 0.0, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.0625994675611715, + "language_loss": 0.87600327, + "learning_rate": 0.00016369426362772271, + "loss": 0.88664174, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3861, + "time_per_iteration": 2.7810909748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058337, + "balance_loss_mlp": 1.04926515, + "diversity_loss_mlp": 0.0, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.06941058470153043, + "language_loss": 0.80742699, + "learning_rate": 0.00016346378891970233, + "loss": 0.81801033, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3862, + "time_per_iteration": 2.846928596496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063331, + "balance_loss_mlp": 1.05435514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.0684493510726064, + "language_loss": 0.81710279, + "learning_rate": 0.00016323344486662633, + "loss": 0.82773608, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3863, + "time_per_iteration": 3.331202745437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259883, + "diversity_loss_mlp": 0.0, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05806816249285044, + "language_loss": 0.78816247, + "learning_rate": 0.00016300323155792247, + "loss": 0.79877937, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3864, + "time_per_iteration": 2.872833490371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060231, + "balance_loss_mlp": 1.05139732, + "diversity_loss_mlp": 0.0, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.06583078508607046, + "language_loss": 0.88677347, + "learning_rate": 0.00016277314908296687, + "loss": 0.89737576, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3865, + "time_per_iteration": 2.6268508434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062318, + "balance_loss_mlp": 1.05286467, + "diversity_loss_mlp": 0.0, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.08180248385301583, + "language_loss": 0.7621361, + "learning_rate": 0.00016254319753108604, + "loss": 0.77275932, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3866, + "time_per_iteration": 2.8856914043426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062277, + "balance_loss_mlp": 1.05305004, + "diversity_loss_mlp": 0.0, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.07310249763973194, + "language_loss": 0.77018058, + "learning_rate": 0.00016231337699155492, + "loss": 0.78080332, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3867, + "time_per_iteration": 2.975250244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059189, + "balance_loss_mlp": 1.04974771, + "diversity_loss_mlp": 0.0, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.07083990267041149, + "language_loss": 0.78228271, + "learning_rate": 0.0001620836875535977, + "loss": 0.79287452, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3868, + "time_per_iteration": 2.856765031814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_mlp": 1.04925001, + "diversity_loss_mlp": 0.0, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.058820941096758894, + "language_loss": 0.80752689, + "learning_rate": 0.00016185412930638766, + "loss": 0.81811309, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3869, + "time_per_iteration": 2.7962300777435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060986, + "balance_loss_mlp": 1.05180645, + "diversity_loss_mlp": 0.0, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.09216022180459393, + "language_loss": 0.82565176, + "learning_rate": 0.00016162470233904765, + "loss": 0.83626163, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3870, + "time_per_iteration": 2.727376937866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059095, + "balance_loss_mlp": 1.05008888, + "diversity_loss_mlp": 0.0, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.08871714462123159, + "language_loss": 0.82108277, + "learning_rate": 0.00016139540674064856, + "loss": 0.83167374, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3871, + "time_per_iteration": 2.747559070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055678, + "balance_loss_mlp": 1.04671371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.063692065795828, + "language_loss": 0.7763024, + "learning_rate": 0.00016116624260021113, + "loss": 0.78685915, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3872, + "time_per_iteration": 2.75909423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106192, + "balance_loss_mlp": 1.0528599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.06099997691226976, + "language_loss": 0.83786505, + "learning_rate": 0.0001609372100067046, + "loss": 0.84848428, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3873, + "time_per_iteration": 2.5251874923706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796431, + "balance_loss_mlp": 1.34714556, + "diversity_loss_mlp": 0.22299039, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.03925838692514683, + "language_loss": 0.85007972, + "learning_rate": 0.0001607083090490475, + "loss": 0.85804403, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01136341, + "step": 3874, + "time_per_iteration": 2.8896329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061928, + "balance_loss_mlp": 1.0527246, + "diversity_loss_mlp": 0.0, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.07963892031444339, + "language_loss": 0.80322075, + "learning_rate": 0.00016047953981610714, + "loss": 0.81384003, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3875, + "time_per_iteration": 2.7198143005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_mlp": 1.02416849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.01953041960218584, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80758721, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3876, + "time_per_iteration": 5.047106981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105976, + "balance_loss_mlp": 1.05069435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.07139005535531126, + "language_loss": 0.80606306, + "learning_rate": 0.0001600223968795889, + "loss": 0.81666064, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3877, + "time_per_iteration": 2.8899221420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_mlp": 1.02230287, + "diversity_loss_mlp": 0.0, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.018847716252117216, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76723289, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3878, + "time_per_iteration": 4.949044466018677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.05449665, + "diversity_loss_mlp": 0.0, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.08037956070996295, + "language_loss": 0.8220886, + "learning_rate": 0.00015956578190706483, + "loss": 0.83272392, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3879, + "time_per_iteration": 2.679077386856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058904, + "balance_loss_mlp": 1.04966509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.07423526276361143, + "language_loss": 0.75933188, + "learning_rate": 0.00015933767262892468, + "loss": 0.76992095, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3880, + "time_per_iteration": 2.725120782852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061626, + "balance_loss_mlp": 1.05248249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.08122487442608403, + "language_loss": 0.81791377, + "learning_rate": 0.00015910969560762927, + "loss": 0.82853001, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3881, + "time_per_iteration": 2.5659735202789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_mlp": 1.05212796, + "diversity_loss_mlp": 0.0, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.06269003532148706, + "language_loss": 0.83085567, + "learning_rate": 0.00015888185093168727, + "loss": 0.84146595, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3882, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0554266, + "diversity_loss_mlp": 0.0, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.06569405974283654, + "language_loss": 0.81109202, + "learning_rate": 0.00015865413868955581, + "loss": 0.82174122, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3883, + "time_per_iteration": 2.6078059673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058237, + "balance_loss_mlp": 1.04946291, + "diversity_loss_mlp": 0.0, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.057634664266444945, + "language_loss": 0.82803142, + "learning_rate": 0.00015842655896964054, + "loss": 0.83861375, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3884, + "time_per_iteration": 3.042433977127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_mlp": 1.0528096, + "diversity_loss_mlp": 0.0, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.07244796431130596, + "language_loss": 0.73654252, + "learning_rate": 0.00015819911186029567, + "loss": 0.74716115, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3885, + "time_per_iteration": 2.8399569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063739, + "balance_loss_mlp": 1.05458951, + "diversity_loss_mlp": 0.0, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.0730187367037383, + "language_loss": 0.86386681, + "learning_rate": 0.00015797179744982443, + "loss": 0.87450415, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3886, + "time_per_iteration": 2.6979753971099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068538, + "balance_loss_mlp": 1.05947804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.06196383449999257, + "language_loss": 0.78900141, + "learning_rate": 0.00015774461582647765, + "loss": 0.79968679, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3887, + "time_per_iteration": 2.6235530376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_mlp": 1.05791271, + "diversity_loss_mlp": 0.0, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07428746170121639, + "language_loss": 0.81271255, + "learning_rate": 0.00015751756707845505, + "loss": 0.82338268, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3888, + "time_per_iteration": 2.654217481613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066279, + "balance_loss_mlp": 1.05733204, + "diversity_loss_mlp": 0.0, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.06349901375293318, + "language_loss": 0.8820529, + "learning_rate": 0.00015729065129390502, + "loss": 0.89271569, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3889, + "time_per_iteration": 2.990723133087158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107017, + "balance_loss_mlp": 1.06125295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.10644115001559669, + "language_loss": 0.82281494, + "learning_rate": 0.0001570638685609241, + "loss": 0.83351666, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3890, + "time_per_iteration": 2.562049627304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064882, + "balance_loss_mlp": 1.0558815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.07005408827456952, + "language_loss": 0.80632579, + "learning_rate": 0.00015683721896755693, + "loss": 0.81697452, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3891, + "time_per_iteration": 2.5688047409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018069, + "balance_loss_mlp": 1.01291943, + "diversity_loss_mlp": 0.0, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.021126139986013294, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228564, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3892, + "time_per_iteration": 4.9241249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063391, + "balance_loss_mlp": 1.05425954, + "diversity_loss_mlp": 0.0, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.07047459901443781, + "language_loss": 0.85042292, + "learning_rate": 0.00015638431955158528, + "loss": 0.8610568, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3893, + "time_per_iteration": 2.696835517883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059789, + "balance_loss_mlp": 1.05092609, + "diversity_loss_mlp": 0.0, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.07429691825865621, + "language_loss": 0.81044436, + "learning_rate": 0.00015615806990481186, + "loss": 0.8210423, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 3894, + "time_per_iteration": 2.721975088119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.05332768573038703, + "language_loss": 0.84447378, + "learning_rate": 0.00015593195374931452, + "loss": 0.85509074, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3895, + "time_per_iteration": 2.724210500717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057311, + "balance_loss_mlp": 1.04820967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.08170178598725314, + "language_loss": 0.79939067, + "learning_rate": 0.00015570597117287922, + "loss": 0.80996376, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3896, + "time_per_iteration": 2.6550590991973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058835, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.07111999470543245, + "language_loss": 0.77950025, + "learning_rate": 0.0001554801222632406, + "loss": 0.79008865, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3897, + "time_per_iteration": 2.5913069248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058781, + "balance_loss_mlp": 1.04961967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.07004004520272819, + "language_loss": 0.8521589, + "learning_rate": 0.00015525440710808052, + "loss": 0.86274672, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3898, + "time_per_iteration": 2.633772850036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105768, + "balance_loss_mlp": 1.04835165, + "diversity_loss_mlp": 0.0, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.07310706246925956, + "language_loss": 0.77907795, + "learning_rate": 0.00015502882579502953, + "loss": 0.78965473, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3899, + "time_per_iteration": 2.938547372817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_mlp": 1.04551327, + "diversity_loss_mlp": 0.0, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.06650950979385485, + "language_loss": 0.8470974, + "learning_rate": 0.00015480337841166592, + "loss": 0.85764492, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3900, + "time_per_iteration": 2.719611167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_mlp": 1.05532193, + "diversity_loss_mlp": 0.0, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06798274648693917, + "language_loss": 0.83017278, + "learning_rate": 0.00015457806504551647, + "loss": 0.84081692, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3901, + "time_per_iteration": 2.815099000930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055292, + "balance_loss_mlp": 1.04617858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.06551967362841071, + "language_loss": 0.78146368, + "learning_rate": 0.0001543528857840554, + "loss": 0.79201663, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3902, + "time_per_iteration": 2.660747528076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105756, + "balance_loss_mlp": 1.04829192, + "diversity_loss_mlp": 0.0, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.08761977110880032, + "language_loss": 0.80069476, + "learning_rate": 0.000154127840714705, + "loss": 0.81127042, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3903, + "time_per_iteration": 2.791895627975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_mlp": 1.04786348, + "diversity_loss_mlp": 0.0, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.08489214172044417, + "language_loss": 0.82145894, + "learning_rate": 0.00015390292992483557, + "loss": 0.83203179, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3904, + "time_per_iteration": 2.531291961669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058521, + "balance_loss_mlp": 1.04955626, + "diversity_loss_mlp": 0.0, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.06641081846092535, + "language_loss": 0.84235787, + "learning_rate": 0.00015367815350176523, + "loss": 0.85294312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3905, + "time_per_iteration": 2.7290806770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_mlp": 1.04627776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.06804815402684934, + "language_loss": 0.82392836, + "learning_rate": 0.00015345351153275987, + "loss": 0.8344835, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3906, + "time_per_iteration": 2.530323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054875, + "balance_loss_mlp": 1.04556477, + "diversity_loss_mlp": 0.0, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.06371304983723255, + "language_loss": 0.80832905, + "learning_rate": 0.00015322900410503332, + "loss": 0.81887782, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3907, + "time_per_iteration": 2.840207576751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062824, + "balance_loss_mlp": 1.05359089, + "diversity_loss_mlp": 0.0, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.0661364017188776, + "language_loss": 0.77996182, + "learning_rate": 0.00015300463130574703, + "loss": 0.79059005, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3908, + "time_per_iteration": 2.8597986698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795371, + "balance_loss_mlp": 1.3454839, + "diversity_loss_mlp": 0.22311893, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.027335085290279493, + "language_loss": 0.81861627, + "learning_rate": 0.00015278039322201033, + "loss": 0.82656997, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01107004, + "step": 3909, + "time_per_iteration": 2.991687774658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056027, + "balance_loss_mlp": 1.04691339, + "diversity_loss_mlp": 0.0, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.07802530294793614, + "language_loss": 0.79405951, + "learning_rate": 0.00015255628994088004, + "loss": 0.80461979, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3910, + "time_per_iteration": 2.552389621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057943, + "balance_loss_mlp": 1.04875183, + "diversity_loss_mlp": 0.0, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06839079088853381, + "language_loss": 0.75070244, + "learning_rate": 0.00015233232154936082, + "loss": 0.76128185, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3911, + "time_per_iteration": 3.2685062885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060306, + "balance_loss_mlp": 1.05104983, + "diversity_loss_mlp": 0.0, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0742904302268966, + "language_loss": 0.76248109, + "learning_rate": 0.0001521084881344048, + "loss": 0.77308416, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3912, + "time_per_iteration": 2.8669307231903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063744, + "balance_loss_mlp": 1.05449915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.07365945451583152, + "language_loss": 0.86536098, + "learning_rate": 0.00015188478978291208, + "loss": 0.87599838, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3913, + "time_per_iteration": 2.8062844276428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060792, + "balance_loss_mlp": 1.05141592, + "diversity_loss_mlp": 0.0, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06964875853647617, + "language_loss": 0.86198735, + "learning_rate": 0.00015166122658173014, + "loss": 0.87259525, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3914, + "time_per_iteration": 2.832261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062251, + "balance_loss_mlp": 1.05276752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.07069372780846282, + "language_loss": 0.88695043, + "learning_rate": 0.00015143779861765332, + "loss": 0.89757293, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3915, + "time_per_iteration": 2.876596689224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057505, + "balance_loss_mlp": 1.04845726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.07477721009048348, + "language_loss": 0.81360573, + "learning_rate": 0.00015121450597742458, + "loss": 0.82418078, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3916, + "time_per_iteration": 2.83457612991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105545, + "balance_loss_mlp": 1.04619908, + "diversity_loss_mlp": 0.0, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07347506206734646, + "language_loss": 0.78634655, + "learning_rate": 0.00015099134874773369, + "loss": 0.79690111, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3917, + "time_per_iteration": 2.7597367763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793692, + "balance_loss_mlp": 1.34194863, + "diversity_loss_mlp": 0.22241086, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.028776380158614775, + "language_loss": 0.80358481, + "learning_rate": 0.00015076832701521793, + "loss": 0.81152171, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151239, + "step": 3918, + "time_per_iteration": 2.746518135070801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_mlp": 1.04122829, + "diversity_loss_mlp": 0.0, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.08224807804324459, + "language_loss": 0.82372093, + "learning_rate": 0.000150545440866462, + "loss": 0.83422583, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3919, + "time_per_iteration": 2.986933708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056343, + "balance_loss_mlp": 1.047104, + "diversity_loss_mlp": 0.0, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.07659379290436485, + "language_loss": 0.78524017, + "learning_rate": 0.000150322690387998, + "loss": 0.79580355, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3920, + "time_per_iteration": 2.5535264015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053379, + "balance_loss_mlp": 1.04395509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.08088787979004233, + "language_loss": 0.75178206, + "learning_rate": 0.00015010007566630535, + "loss": 0.76231587, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3921, + "time_per_iteration": 2.752476930618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052876, + "balance_loss_mlp": 1.0435003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09066204118342673, + "language_loss": 0.81410325, + "learning_rate": 0.00014987759678781077, + "loss": 0.82463199, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3922, + "time_per_iteration": 2.6611218452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_mlp": 1.04057336, + "diversity_loss_mlp": 0.0, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07014269793522399, + "language_loss": 0.82503462, + "learning_rate": 0.00014965525383888795, + "loss": 0.83553147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3923, + "time_per_iteration": 2.7689740657806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_mlp": 1.04243279, + "diversity_loss_mlp": 0.0, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.07037901848858046, + "language_loss": 0.72344971, + "learning_rate": 0.00014943304690585851, + "loss": 0.73396569, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3924, + "time_per_iteration": 2.926786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050623, + "balance_loss_mlp": 1.04116416, + "diversity_loss_mlp": 0.0, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07074790487011906, + "language_loss": 0.79134214, + "learning_rate": 0.0001492109760749908, + "loss": 0.80184835, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3925, + "time_per_iteration": 2.6663551330566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.03920674, + "diversity_loss_mlp": 0.0, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.06259359506310941, + "language_loss": 0.79865938, + "learning_rate": 0.00014898904143250002, + "loss": 0.80914569, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3926, + "time_per_iteration": 2.7111570835113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007032, + "balance_loss_mlp": 1.00193024, + "diversity_loss_mlp": 0.0, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.018464770707338953, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76762235, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.05102539, + "routerloss_mlp": 0.0, + "step": 3927, + "time_per_iteration": 4.9247355461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049551, + "balance_loss_mlp": 1.04027081, + "diversity_loss_mlp": 0.0, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.0681788266526358, + "language_loss": 0.80484271, + "learning_rate": 0.0001485455810572474, + "loss": 0.81533813, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3928, + "time_per_iteration": 2.644436836242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050291, + "balance_loss_mlp": 1.04075408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.05891834719109388, + "language_loss": 0.83858299, + "learning_rate": 0.00014832405549665236, + "loss": 0.84908581, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3929, + "time_per_iteration": 2.7012484073638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.03651154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06702269562440989, + "language_loss": 0.78850049, + "learning_rate": 0.00014810266646876746, + "loss": 0.79895926, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3930, + "time_per_iteration": 2.768267869949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_mlp": 1.0400542, + "diversity_loss_mlp": 0.0, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.07203252309013448, + "language_loss": 0.77448905, + "learning_rate": 0.00014788141405954364, + "loss": 0.78498399, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3931, + "time_per_iteration": 2.9904940128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_mlp": 1.03817058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.07800689348595595, + "language_loss": 0.8509475, + "learning_rate": 0.00014766029835487865, + "loss": 0.86142522, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3932, + "time_per_iteration": 2.712207078933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.04148519, + "diversity_loss_mlp": 0.0, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.09178447768332373, + "language_loss": 0.79506183, + "learning_rate": 0.0001474393194406173, + "loss": 0.80557162, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3933, + "time_per_iteration": 2.933224678039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048897, + "balance_loss_mlp": 1.03937268, + "diversity_loss_mlp": 0.0, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05892607400759823, + "language_loss": 0.79702771, + "learning_rate": 0.00014721847740255112, + "loss": 0.80751669, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3934, + "time_per_iteration": 2.826552391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003728, + "balance_loss_mlp": 0.99864936, + "diversity_loss_mlp": 0.0, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.02131829704568505, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74915653, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 3935, + "time_per_iteration": 4.626272439956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050547, + "balance_loss_mlp": 1.0411061, + "diversity_loss_mlp": 0.0, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.08283198519893772, + "language_loss": 0.78541541, + "learning_rate": 0.00014677720429790526, + "loss": 0.79592091, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3936, + "time_per_iteration": 2.634308338165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046754, + "balance_loss_mlp": 1.03724098, + "diversity_loss_mlp": 0.0, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.060589870954327815, + "language_loss": 0.84442061, + "learning_rate": 0.0001465567734026429, + "loss": 0.8548882, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3937, + "time_per_iteration": 2.716531276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04150677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08803792614427135, + "language_loss": 0.82826757, + "learning_rate": 0.00014633647972621034, + "loss": 0.83878005, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3938, + "time_per_iteration": 2.4589834213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053327, + "balance_loss_mlp": 1.04381418, + "diversity_loss_mlp": 0.0, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.07008474871833649, + "language_loss": 0.86420083, + "learning_rate": 0.00014611632335413354, + "loss": 0.87473404, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3939, + "time_per_iteration": 2.7953155040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_mlp": 1.04597211, + "diversity_loss_mlp": 0.0, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.06005420836927303, + "language_loss": 0.82715803, + "learning_rate": 0.00014589630437188456, + "loss": 0.83771348, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3940, + "time_per_iteration": 3.1720919609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056474, + "balance_loss_mlp": 1.04727697, + "diversity_loss_mlp": 0.0, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.07556117037580423, + "language_loss": 0.78885162, + "learning_rate": 0.00014567642286488253, + "loss": 0.7994163, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3941, + "time_per_iteration": 2.5224215984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105497, + "balance_loss_mlp": 1.0453198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.10380533878684198, + "language_loss": 0.79189527, + "learning_rate": 0.00014545667891849258, + "loss": 0.80244499, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3942, + "time_per_iteration": 2.6196579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_mlp": 1.04717493, + "diversity_loss_mlp": 0.0, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.06980232416240703, + "language_loss": 0.82745945, + "learning_rate": 0.00014523707261802733, + "loss": 0.83802581, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3943, + "time_per_iteration": 2.652625799179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794094, + "balance_loss_mlp": 1.34365344, + "diversity_loss_mlp": 0.22232203, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.034795977662747106, + "language_loss": 0.81799769, + "learning_rate": 0.00014501760404874527, + "loss": 0.82593858, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01110633, + "step": 3944, + "time_per_iteration": 2.7529001235961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059832, + "balance_loss_mlp": 1.05071235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.07566953086997541, + "language_loss": 0.85807776, + "learning_rate": 0.00014479827329585176, + "loss": 0.86867607, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3945, + "time_per_iteration": 2.701622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_mlp": 1.04233766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.05933089648069645, + "language_loss": 0.84881538, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933375, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3946, + "time_per_iteration": 2.728095769882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787034, + "balance_loss_mlp": 1.32538223, + "diversity_loss_mlp": 0.22601989, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.02987157443530754, + "language_loss": 0.83105904, + "learning_rate": 0.00014436002557978371, + "loss": 0.83892936, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.011333, + "step": 3947, + "time_per_iteration": 2.8229527473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009615, + "balance_loss_mlp": 1.00491834, + "diversity_loss_mlp": 0.0, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.009520189474687826, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77652764, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.046875, + "routerloss_mlp": 0.0, + "step": 3948, + "time_per_iteration": 6.289541482925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060096, + "balance_loss_mlp": 1.05072582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.06379991139513626, + "language_loss": 0.79987645, + "learning_rate": 0.0001439223301503945, + "loss": 0.8104775, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3949, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063286, + "balance_loss_mlp": 1.05441725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.07443357695534152, + "language_loss": 0.75937033, + "learning_rate": 0.00014370368975564834, + "loss": 0.7700032, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 3950, + "time_per_iteration": 2.939652442932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062595, + "balance_loss_mlp": 1.05339789, + "diversity_loss_mlp": 0.0, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.07225326310483449, + "language_loss": 0.83501256, + "learning_rate": 0.00014348518768739766, + "loss": 0.84563851, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3951, + "time_per_iteration": 2.760315179824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013895, + "balance_loss_mlp": 1.00924563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.01015881799745275, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77741933, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 3952, + "time_per_iteration": 4.8084025382995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106276, + "balance_loss_mlp": 1.05365205, + "diversity_loss_mlp": 0.0, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.06460876756714844, + "language_loss": 0.86549526, + "learning_rate": 0.00014304859886964867, + "loss": 0.87612283, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3953, + "time_per_iteration": 2.9919626712799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05655789, + "diversity_loss_mlp": 0.0, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.06531272999026969, + "language_loss": 0.83625901, + "learning_rate": 0.00014283051228964878, + "loss": 0.84691703, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3954, + "time_per_iteration": 2.7195558547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060232, + "balance_loss_mlp": 1.05114245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.06973579873696066, + "language_loss": 0.82862848, + "learning_rate": 0.00014261256437514197, + "loss": 0.83923078, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3955, + "time_per_iteration": 2.6542091369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794195, + "balance_loss_mlp": 1.3411088, + "diversity_loss_mlp": 0.22477356, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.03401627820018092, + "language_loss": 0.82645166, + "learning_rate": 0.0001423947552107428, + "loss": 0.83439362, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0112533, + "step": 3956, + "time_per_iteration": 2.7648067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.05335546, + "diversity_loss_mlp": 0.0, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06632119476384091, + "language_loss": 0.77184016, + "learning_rate": 0.00014217708488101243, + "loss": 0.78246629, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3957, + "time_per_iteration": 3.1002120971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05514848, + "diversity_loss_mlp": 0.0, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08639703813163502, + "language_loss": 0.77281177, + "learning_rate": 0.0001419595534704579, + "loss": 0.78345418, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3958, + "time_per_iteration": 2.7124218940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062721, + "balance_loss_mlp": 1.05369043, + "diversity_loss_mlp": 0.0, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.06838082339011158, + "language_loss": 0.81229275, + "learning_rate": 0.00014174216106353237, + "loss": 0.82291996, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3959, + "time_per_iteration": 2.628516912460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060156, + "balance_loss_mlp": 1.05085802, + "diversity_loss_mlp": 0.0, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.07205328766008003, + "language_loss": 0.76858711, + "learning_rate": 0.00014152490774463512, + "loss": 0.77918863, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3960, + "time_per_iteration": 2.630159854888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106295, + "balance_loss_mlp": 1.05382431, + "diversity_loss_mlp": 0.0, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.0819861529910791, + "language_loss": 0.87198371, + "learning_rate": 0.00014130779359811135, + "loss": 0.88261318, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3961, + "time_per_iteration": 2.464413642883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058979, + "balance_loss_mlp": 1.04990077, + "diversity_loss_mlp": 0.0, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.07245892571162069, + "language_loss": 0.85946453, + "learning_rate": 0.0001410908187082521, + "loss": 0.87005424, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3962, + "time_per_iteration": 2.921780586242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058262, + "balance_loss_mlp": 1.04887986, + "diversity_loss_mlp": 0.0, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06688462156779182, + "language_loss": 0.83390021, + "learning_rate": 0.0001408739831592949, + "loss": 0.84448284, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3963, + "time_per_iteration": 2.6833889484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060293, + "balance_loss_mlp": 1.05104804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.0755930480675871, + "language_loss": 0.77544367, + "learning_rate": 0.0001406572870354224, + "loss": 0.7860465, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3964, + "time_per_iteration": 2.7871947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060093, + "balance_loss_mlp": 1.05084801, + "diversity_loss_mlp": 0.0, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.06988595261199848, + "language_loss": 0.86813599, + "learning_rate": 0.00014044073042076337, + "loss": 0.87873685, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3965, + "time_per_iteration": 2.4948155879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_mlp": 1.0558666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.053016831320737375, + "language_loss": 0.88845956, + "learning_rate": 0.00014022431339939302, + "loss": 0.8991074, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3966, + "time_per_iteration": 2.673383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057572, + "balance_loss_mlp": 1.04824972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.09057872820095057, + "language_loss": 0.7816959, + "learning_rate": 0.00014000803605533163, + "loss": 0.79227161, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3967, + "time_per_iteration": 2.8631951808929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057314, + "balance_loss_mlp": 1.04857016, + "diversity_loss_mlp": 0.0, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.08630668575925342, + "language_loss": 0.84042531, + "learning_rate": 0.00013979189847254553, + "loss": 0.85099846, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3968, + "time_per_iteration": 2.5586295127868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057911, + "balance_loss_mlp": 1.04832053, + "diversity_loss_mlp": 0.0, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.07119073500769035, + "language_loss": 0.80335605, + "learning_rate": 0.00013957590073494674, + "loss": 0.81393516, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3969, + "time_per_iteration": 2.785759449005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055351, + "balance_loss_mlp": 1.0460887, + "diversity_loss_mlp": 0.0, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0691753234001315, + "language_loss": 0.78865349, + "learning_rate": 0.0001393600429263931, + "loss": 0.79920697, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3970, + "time_per_iteration": 2.7582993507385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013524, + "balance_loss_mlp": 1.00873148, + "diversity_loss_mlp": 0.0, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.011908325756944461, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.7575841, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 3971, + "time_per_iteration": 4.944155693054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051128, + "balance_loss_mlp": 1.04182386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.07417078530438988, + "language_loss": 0.81570405, + "learning_rate": 0.0001389287474315804, + "loss": 0.82621539, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3972, + "time_per_iteration": 2.6553244590759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052519, + "balance_loss_mlp": 1.04347086, + "diversity_loss_mlp": 0.0, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.05487535888911553, + "language_loss": 0.79840803, + "learning_rate": 0.00013871330991276505, + "loss": 0.8089332, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 3973, + "time_per_iteration": 2.681697368621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_mlp": 1.0428077, + "diversity_loss_mlp": 0.0, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.08960984364762024, + "language_loss": 0.80946076, + "learning_rate": 0.00013849801265788247, + "loss": 0.81998283, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3974, + "time_per_iteration": 3.0523104667663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796632, + "balance_loss_mlp": 1.34598541, + "diversity_loss_mlp": 0.22497699, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.033347453631336434, + "language_loss": 0.83125114, + "learning_rate": 0.00013828285575051818, + "loss": 0.83921754, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115073, + "step": 3975, + "time_per_iteration": 2.631014108657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_mlp": 1.04301977, + "diversity_loss_mlp": 0.0, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.06872239671854397, + "language_loss": 0.84060633, + "learning_rate": 0.0001380678392742035, + "loss": 0.85112655, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3976, + "time_per_iteration": 2.710768938064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050013, + "balance_loss_mlp": 1.04042244, + "diversity_loss_mlp": 0.0, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05722299510673748, + "language_loss": 0.84721446, + "learning_rate": 0.00013785296331241526, + "loss": 0.85771459, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3977, + "time_per_iteration": 2.863175868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049924, + "balance_loss_mlp": 1.04060829, + "diversity_loss_mlp": 0.0, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.0690026214963165, + "language_loss": 0.87410915, + "learning_rate": 0.00013763822794857583, + "loss": 0.88460839, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3978, + "time_per_iteration": 3.3100810050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049847, + "balance_loss_mlp": 1.04050136, + "diversity_loss_mlp": 0.0, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06632607852839086, + "language_loss": 0.90003061, + "learning_rate": 0.00013742363326605278, + "loss": 0.91052908, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3979, + "time_per_iteration": 2.754115581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_mlp": 1.04258752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.059791344398012564, + "language_loss": 0.78432417, + "learning_rate": 0.00013720917934815935, + "loss": 0.79484463, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3980, + "time_per_iteration": 2.801797866821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_mlp": 1.04425907, + "diversity_loss_mlp": 0.0, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.08312893208703641, + "language_loss": 0.82967758, + "learning_rate": 0.00013699486627815344, + "loss": 0.84021544, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3981, + "time_per_iteration": 2.6589224338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052365, + "balance_loss_mlp": 1.04295897, + "diversity_loss_mlp": 0.0, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.07260212580199023, + "language_loss": 0.82633436, + "learning_rate": 0.00013678069413923928, + "loss": 0.83685803, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3982, + "time_per_iteration": 2.6876726150512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_mlp": 1.0454247, + "diversity_loss_mlp": 0.0, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.060912508562222696, + "language_loss": 0.81971568, + "learning_rate": 0.00013656666301456555, + "loss": 0.83026105, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3983, + "time_per_iteration": 2.547969341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.04195666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.07203556219041155, + "language_loss": 0.84272242, + "learning_rate": 0.0001363527729872267, + "loss": 0.85323668, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3984, + "time_per_iteration": 2.638418197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052921, + "balance_loss_mlp": 1.04378974, + "diversity_loss_mlp": 0.0, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06683426358110046, + "language_loss": 0.76389247, + "learning_rate": 0.00013613902414026207, + "loss": 0.77442169, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3985, + "time_per_iteration": 2.7989237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055274, + "balance_loss_mlp": 1.04588056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.07515257411295292, + "language_loss": 0.82508516, + "learning_rate": 0.00013592541655665642, + "loss": 0.83563781, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3986, + "time_per_iteration": 3.015293836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_mlp": 1.04635525, + "diversity_loss_mlp": 0.0, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.07774054250244124, + "language_loss": 0.85269868, + "learning_rate": 0.00013571195031933947, + "loss": 0.86325783, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3987, + "time_per_iteration": 2.6980810165405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010581, + "balance_loss_mlp": 1.0057168, + "diversity_loss_mlp": 0.0, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.012742252799641985, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81491923, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 3988, + "time_per_iteration": 4.809666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049905, + "balance_loss_mlp": 1.04043365, + "diversity_loss_mlp": 0.0, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.07424799958173026, + "language_loss": 0.85590923, + "learning_rate": 0.00013528544221501655, + "loss": 0.86640829, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3989, + "time_per_iteration": 2.7649118900299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010579, + "balance_loss_mlp": 1.04848218, + "diversity_loss_mlp": 0.0, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.07001972276723446, + "language_loss": 0.81763613, + "learning_rate": 0.00013507240051359586, + "loss": 0.82821512, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3990, + "time_per_iteration": 3.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_mlp": 1.04797447, + "diversity_loss_mlp": 0.0, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.07160878890290734, + "language_loss": 0.86059034, + "learning_rate": 0.00013485950048963425, + "loss": 0.87116206, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3991, + "time_per_iteration": 2.5790224075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_mlp": 1.04409003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.0667031946156718, + "language_loss": 0.82767689, + "learning_rate": 0.00013464674222578643, + "loss": 0.83820868, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3992, + "time_per_iteration": 3.201578140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057061, + "balance_loss_mlp": 1.04791176, + "diversity_loss_mlp": 0.0, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.08569609854575283, + "language_loss": 0.83404213, + "learning_rate": 0.00013443412580465292, + "loss": 0.84461272, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3993, + "time_per_iteration": 2.5704004764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_mlp": 1.04118383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.0673936052155154, + "language_loss": 0.83964813, + "learning_rate": 0.00013422165130877857, + "loss": 0.85015404, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3994, + "time_per_iteration": 2.9138286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057424, + "balance_loss_mlp": 1.0483048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.07281784593119212, + "language_loss": 0.8049981, + "learning_rate": 0.00013400931882065327, + "loss": 0.81557238, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3995, + "time_per_iteration": 2.6342077255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055278, + "balance_loss_mlp": 1.04585409, + "diversity_loss_mlp": 0.0, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.062093519620885704, + "language_loss": 0.80842459, + "learning_rate": 0.0001337971284227118, + "loss": 0.81897736, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3996, + "time_per_iteration": 3.0022008419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004861, + "balance_loss_mlp": 1.00011611, + "diversity_loss_mlp": 0.0, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.007312606829584695, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77123284, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3997, + "time_per_iteration": 4.911606311798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055259, + "balance_loss_mlp": 1.04605579, + "diversity_loss_mlp": 0.0, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.06973120075241693, + "language_loss": 0.8046248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81517738, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3998, + "time_per_iteration": 2.683593273162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053524, + "balance_loss_mlp": 1.0442791, + "diversity_loss_mlp": 0.0, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.0765354269800423, + "language_loss": 0.85693717, + "learning_rate": 0.0001331614105935109, + "loss": 0.86747241, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3999, + "time_per_iteration": 2.675220489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054062, + "balance_loss_mlp": 1.04481769, + "diversity_loss_mlp": 0.0, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.06349178277774252, + "language_loss": 0.84176111, + "learning_rate": 0.00013294978937954883, + "loss": 0.85230172, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4000, + "time_per_iteration": 2.8622941970825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_mlp": 1.04558492, + "diversity_loss_mlp": 0.0, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.09234703224205486, + "language_loss": 0.85414779, + "learning_rate": 0.00013273831066711655, + "loss": 0.86469758, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4001, + "time_per_iteration": 2.6298534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_mlp": 1.04325461, + "diversity_loss_mlp": 0.0, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.06055695533202859, + "language_loss": 0.79907209, + "learning_rate": 0.00013252697453831747, + "loss": 0.8095969, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4002, + "time_per_iteration": 2.692922830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.03798985, + "diversity_loss_mlp": 0.0, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.06495740089460322, + "language_loss": 0.82613641, + "learning_rate": 0.00013231578107519916, + "loss": 0.83660942, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4003, + "time_per_iteration": 2.9229555130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049706, + "balance_loss_mlp": 1.04049134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.07621650724161941, + "language_loss": 0.82803172, + "learning_rate": 0.00013210473035975422, + "loss": 0.83852881, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4004, + "time_per_iteration": 2.569532632827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.04116035, + "diversity_loss_mlp": 0.0, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.07296352629436301, + "language_loss": 0.85812414, + "learning_rate": 0.0001318938224739201, + "loss": 0.86862826, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4005, + "time_per_iteration": 3.0234341621398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_mlp": 1.04063106, + "diversity_loss_mlp": 0.0, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06528825004105314, + "language_loss": 0.83766401, + "learning_rate": 0.00013168305749957843, + "loss": 0.84816337, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 4006, + "time_per_iteration": 2.733548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790765, + "balance_loss_mlp": 1.33768153, + "diversity_loss_mlp": 0.22157404, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.030772470198916744, + "language_loss": 0.82874978, + "learning_rate": 0.00013147243551855532, + "loss": 0.8366574, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01113757, + "step": 4007, + "time_per_iteration": 2.6124446392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_mlp": 1.0404737, + "diversity_loss_mlp": 0.0, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.05859111752284866, + "language_loss": 0.80677342, + "learning_rate": 0.00013126195661262148, + "loss": 0.81727076, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4008, + "time_per_iteration": 2.7372946739196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052041, + "balance_loss_mlp": 1.04286766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.06950402202343967, + "language_loss": 0.86921602, + "learning_rate": 0.00013105162086349216, + "loss": 0.87973642, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4009, + "time_per_iteration": 2.825164556503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050421, + "balance_loss_mlp": 1.04102159, + "diversity_loss_mlp": 0.0, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05664497988696294, + "language_loss": 0.85945249, + "learning_rate": 0.00013084142835282687, + "loss": 0.86995667, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 4010, + "time_per_iteration": 2.6627306938171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590218, + "balance_loss_mlp": 1.02735484, + "diversity_loss_mlp": 0.13424492, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.0012430140076356488, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80474579, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00941846, + "step": 4011, + "time_per_iteration": 4.808507919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050884, + "balance_loss_mlp": 1.04154992, + "diversity_loss_mlp": 0.0, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.062052307609784016, + "language_loss": 0.89290094, + "learning_rate": 0.0001304214733732485, + "loss": 0.90340984, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4012, + "time_per_iteration": 2.7328708171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105349, + "balance_loss_mlp": 1.04380453, + "diversity_loss_mlp": 0.0, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.07734543299334512, + "language_loss": 0.82669097, + "learning_rate": 0.00013021171106737672, + "loss": 0.83722585, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 4013, + "time_per_iteration": 2.6573734283447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049877, + "balance_loss_mlp": 1.04070377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.06603423132938777, + "language_loss": 0.80092031, + "learning_rate": 0.00013000209232605071, + "loss": 0.81141913, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4014, + "time_per_iteration": 2.717602014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053571, + "balance_loss_mlp": 1.04388535, + "diversity_loss_mlp": 0.0, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.10571386830465022, + "language_loss": 0.80179751, + "learning_rate": 0.0001297926172306519, + "loss": 0.81233323, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 4015, + "time_per_iteration": 2.65010142326355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_mlp": 1.04230273, + "diversity_loss_mlp": 0.0, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.06492582612573077, + "language_loss": 0.7883606, + "learning_rate": 0.0001295832858625055, + "loss": 0.79887861, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4016, + "time_per_iteration": 3.2565736770629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.04109037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.06662088321139942, + "language_loss": 0.70083648, + "learning_rate": 0.00012937409830288154, + "loss": 0.71134186, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 4017, + "time_per_iteration": 2.818197250366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046111, + "balance_loss_mlp": 1.03688383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08953669234150197, + "language_loss": 0.84953344, + "learning_rate": 0.00012916505463299362, + "loss": 0.85999447, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4018, + "time_per_iteration": 2.5104525089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_mlp": 1.03696132, + "diversity_loss_mlp": 0.0, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.08710028809718832, + "language_loss": 0.78235918, + "learning_rate": 0.00012895615493399972, + "loss": 0.79282427, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 4019, + "time_per_iteration": 2.7878103256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104679, + "balance_loss_mlp": 1.03747368, + "diversity_loss_mlp": 0.0, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.07808729146965544, + "language_loss": 0.82637143, + "learning_rate": 0.00012874739928700192, + "loss": 0.83683932, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 4020, + "time_per_iteration": 2.5788097381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_mlp": 1.03501582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.07324265685000747, + "language_loss": 0.79874408, + "learning_rate": 0.00012853878777304624, + "loss": 0.80918914, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 4021, + "time_per_iteration": 2.870278835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794381, + "balance_loss_mlp": 1.34430456, + "diversity_loss_mlp": 0.22252312, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.029931863934209574, + "language_loss": 0.84459031, + "learning_rate": 0.000128330320473123, + "loss": 0.85253412, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01096685, + "step": 4022, + "time_per_iteration": 2.7129287719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008173, + "balance_loss_mlp": 1.00330901, + "diversity_loss_mlp": 0.0, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.013994594591819043, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.7934007, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 4023, + "time_per_iteration": 4.895900726318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051725, + "balance_loss_mlp": 1.04231346, + "diversity_loss_mlp": 0.0, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.07018696985022486, + "language_loss": 0.81708258, + "learning_rate": 0.0001279138188390543, + "loss": 0.82759976, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 4024, + "time_per_iteration": 2.745079517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_mlp": 1.04130435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.06486800405407347, + "language_loss": 0.86009115, + "learning_rate": 0.00012770578466660915, + "loss": 0.87059748, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4025, + "time_per_iteration": 2.848886013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.04474843, + "diversity_loss_mlp": 0.0, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.06391594939980325, + "language_loss": 0.81626999, + "learning_rate": 0.0001274978950315968, + "loss": 0.82681203, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 4026, + "time_per_iteration": 2.791773796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104997, + "balance_loss_mlp": 1.04037929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.11270799389052534, + "language_loss": 0.83240479, + "learning_rate": 0.00012729015001472716, + "loss": 0.84290445, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 4027, + "time_per_iteration": 2.6333580017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_mlp": 1.04164386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.06039716871949276, + "language_loss": 0.81597829, + "learning_rate": 0.00012708254969665418, + "loss": 0.82648969, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4028, + "time_per_iteration": 2.753960132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.0482347, + "diversity_loss_mlp": 0.0, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.08015627547619836, + "language_loss": 0.83207834, + "learning_rate": 0.00012687509415797526, + "loss": 0.84265172, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4029, + "time_per_iteration": 2.549224376678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055891, + "balance_loss_mlp": 1.04669952, + "diversity_loss_mlp": 0.0, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.0754412874698092, + "language_loss": 0.81577122, + "learning_rate": 0.00012666778347923208, + "loss": 0.82633013, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4030, + "time_per_iteration": 2.6578049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058934, + "balance_loss_mlp": 1.04996991, + "diversity_loss_mlp": 0.0, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05434911795401194, + "language_loss": 0.83884913, + "learning_rate": 0.0001264606177409092, + "loss": 0.84943849, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4031, + "time_per_iteration": 2.7437548637390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054431, + "balance_loss_mlp": 1.04539514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.06981681066227559, + "language_loss": 0.85926938, + "learning_rate": 0.00012625359702343609, + "loss": 0.86981368, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4032, + "time_per_iteration": 2.7145252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062978, + "balance_loss_mlp": 1.05414999, + "diversity_loss_mlp": 0.0, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.06703655691775996, + "language_loss": 0.84627414, + "learning_rate": 0.00012604672140718504, + "loss": 0.85690391, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4033, + "time_per_iteration": 2.6776609420776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061314, + "balance_loss_mlp": 1.05224824, + "diversity_loss_mlp": 0.0, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.0713724123127894, + "language_loss": 0.77912575, + "learning_rate": 0.00012583999097247233, + "loss": 0.78973895, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4034, + "time_per_iteration": 2.8429367542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058219, + "balance_loss_mlp": 1.04938531, + "diversity_loss_mlp": 0.0, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.07138701732892383, + "language_loss": 0.80042505, + "learning_rate": 0.0001256334057995578, + "loss": 0.81100732, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4035, + "time_per_iteration": 2.805361032485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060081, + "balance_loss_mlp": 1.05109227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.06152435345467902, + "language_loss": 0.85125613, + "learning_rate": 0.000125426965968645, + "loss": 0.86185694, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4036, + "time_per_iteration": 2.7150938510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064057, + "balance_loss_mlp": 1.05523515, + "diversity_loss_mlp": 0.0, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07000613008602406, + "language_loss": 0.819399, + "learning_rate": 0.00012522067155988092, + "loss": 0.83003962, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4037, + "time_per_iteration": 2.6996352672576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060303, + "balance_loss_mlp": 1.05135584, + "diversity_loss_mlp": 0.0, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.0718823999319763, + "language_loss": 0.75306779, + "learning_rate": 0.00012501452265335617, + "loss": 0.7636708, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4038, + "time_per_iteration": 2.8315415382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066111, + "balance_loss_mlp": 1.05724156, + "diversity_loss_mlp": 0.0, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.06411925705378174, + "language_loss": 0.83063197, + "learning_rate": 0.0001248085193291047, + "loss": 0.84129304, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4039, + "time_per_iteration": 2.729095935821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069535, + "balance_loss_mlp": 1.0605464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05882048458025786, + "language_loss": 0.82089669, + "learning_rate": 0.00012460266166710443, + "loss": 0.83159202, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4040, + "time_per_iteration": 3.1514501571655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_mlp": 1.06013775, + "diversity_loss_mlp": 0.0, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.07867166554480139, + "language_loss": 0.77746958, + "learning_rate": 0.00012439694974727633, + "loss": 0.78815889, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4041, + "time_per_iteration": 3.0117955207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_mlp": 1.05708027, + "diversity_loss_mlp": 0.0, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.06430167773545564, + "language_loss": 0.79798543, + "learning_rate": 0.00012419138364948458, + "loss": 0.80864501, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4042, + "time_per_iteration": 2.7055745124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064858, + "balance_loss_mlp": 1.05601263, + "diversity_loss_mlp": 0.0, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.06788477072783218, + "language_loss": 0.82296908, + "learning_rate": 0.00012398596345353702, + "loss": 0.83361769, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4043, + "time_per_iteration": 2.8943872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_mlp": 1.05608058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06253380969554054, + "language_loss": 0.83342338, + "learning_rate": 0.0001237806892391851, + "loss": 0.8440733, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4044, + "time_per_iteration": 2.697079658508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061837, + "balance_loss_mlp": 1.05312264, + "diversity_loss_mlp": 0.0, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.07069263559946819, + "language_loss": 0.81128013, + "learning_rate": 0.0001235755610861233, + "loss": 0.82189852, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4045, + "time_per_iteration": 2.7329134941101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066232, + "balance_loss_mlp": 1.05731463, + "diversity_loss_mlp": 0.0, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.07032278053298287, + "language_loss": 0.85504925, + "learning_rate": 0.0001233705790739893, + "loss": 0.86571157, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4046, + "time_per_iteration": 2.708867073059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061968, + "balance_loss_mlp": 1.05317056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.08570945023626393, + "language_loss": 0.7512747, + "learning_rate": 0.0001231657432823643, + "loss": 0.76189435, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4047, + "time_per_iteration": 3.209035634994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064295, + "balance_loss_mlp": 1.05536008, + "diversity_loss_mlp": 0.0, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.07478772193794427, + "language_loss": 0.78683329, + "learning_rate": 0.0001229610537907725, + "loss": 0.79747623, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4048, + "time_per_iteration": 2.570645332336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063203, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07810921414498996, + "language_loss": 0.90262878, + "learning_rate": 0.00012275651067868143, + "loss": 0.91326082, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4049, + "time_per_iteration": 2.5862553119659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058533, + "balance_loss_mlp": 1.04978311, + "diversity_loss_mlp": 0.0, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05845393765756997, + "language_loss": 0.80259252, + "learning_rate": 0.00012255211402550182, + "loss": 0.81317782, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4050, + "time_per_iteration": 3.2020328044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055369, + "balance_loss_mlp": 1.04645181, + "diversity_loss_mlp": 0.0, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.07830185849799275, + "language_loss": 0.76506507, + "learning_rate": 0.00012234786391058727, + "loss": 0.77561879, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4051, + "time_per_iteration": 2.823751449584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059116, + "balance_loss_mlp": 1.05021727, + "diversity_loss_mlp": 0.0, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.07934971719083544, + "language_loss": 0.85162616, + "learning_rate": 0.0001221437604132352, + "loss": 0.86221731, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4052, + "time_per_iteration": 2.6284594535827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054893, + "balance_loss_mlp": 1.04598236, + "diversity_loss_mlp": 0.0, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07077897315409304, + "language_loss": 0.8102321, + "learning_rate": 0.0001219398036126852, + "loss": 0.82078099, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4053, + "time_per_iteration": 2.7439231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059971, + "balance_loss_mlp": 1.05101228, + "diversity_loss_mlp": 0.0, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.06870313821829518, + "language_loss": 0.78245676, + "learning_rate": 0.00012173599358812027, + "loss": 0.79305649, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4054, + "time_per_iteration": 3.256080150604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058619, + "balance_loss_mlp": 1.04986334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07402592003625927, + "language_loss": 0.82719493, + "learning_rate": 0.0001215323304186668, + "loss": 0.83778107, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4055, + "time_per_iteration": 2.7612040042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105856, + "balance_loss_mlp": 1.05008435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.06917846158934658, + "language_loss": 0.87829256, + "learning_rate": 0.00012132881418339364, + "loss": 0.88887817, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.08483887, + "routerloss_mlp": 0.0, + "step": 4056, + "time_per_iteration": 2.7365031242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006422, + "balance_loss_mlp": 1.00186825, + "diversity_loss_mlp": 0.0, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.016656968003394067, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78523988, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4057, + "time_per_iteration": 4.83305811882019 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_mlp": 1.04785705, + "diversity_loss_mlp": 0.0, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.06805160455788861, + "language_loss": 0.77303064, + "learning_rate": 0.00012092222283137944, + "loss": 0.78359842, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4058, + "time_per_iteration": 2.749647617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100669, + "balance_loss_mlp": 1.00213623, + "diversity_loss_mlp": 0.0, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.014137874321597207, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79913002, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4059, + "time_per_iteration": 4.786531209945679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060827, + "balance_loss_mlp": 1.0521071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.0627573295973092, + "language_loss": 0.83679825, + "learning_rate": 0.00012051622016348856, + "loss": 0.84740651, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4060, + "time_per_iteration": 2.999849557876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060136, + "balance_loss_mlp": 1.05145788, + "diversity_loss_mlp": 0.0, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.09064537340570315, + "language_loss": 0.84317231, + "learning_rate": 0.00012031343978315539, + "loss": 0.85377359, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4061, + "time_per_iteration": 2.468447208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056381, + "balance_loss_mlp": 1.04746997, + "diversity_loss_mlp": 0.0, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06926307807295869, + "language_loss": 0.8253361, + "learning_rate": 0.00012011080681021774, + "loss": 0.83589995, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4062, + "time_per_iteration": 2.6554322242736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_mlp": 1.04981685, + "diversity_loss_mlp": 0.0, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.07294593948757502, + "language_loss": 0.86419785, + "learning_rate": 0.00011990832132334512, + "loss": 0.87478459, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4063, + "time_per_iteration": 2.514464855194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_mlp": 1.04535961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.07578138035513655, + "language_loss": 0.82624197, + "learning_rate": 0.00011970598340114897, + "loss": 0.83678591, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4064, + "time_per_iteration": 2.931457042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051575, + "balance_loss_mlp": 1.04267633, + "diversity_loss_mlp": 0.0, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.07400316047770077, + "language_loss": 0.84204572, + "learning_rate": 0.00011950379312218396, + "loss": 0.85256147, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4065, + "time_per_iteration": 2.7011330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053821, + "balance_loss_mlp": 1.04467154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.057956585414562535, + "language_loss": 0.86203766, + "learning_rate": 0.00011930175056494719, + "loss": 0.87257588, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4066, + "time_per_iteration": 2.877427816390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04519939, + "diversity_loss_mlp": 0.0, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.057083401886059204, + "language_loss": 0.75923216, + "learning_rate": 0.00011909985580787885, + "loss": 0.76977456, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4067, + "time_per_iteration": 2.624633312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047724, + "balance_loss_mlp": 1.03850365, + "diversity_loss_mlp": 0.0, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.05949124262263275, + "language_loss": 0.81228232, + "learning_rate": 0.00011889810892936137, + "loss": 0.82275951, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4068, + "time_per_iteration": 2.736132860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060005, + "balance_loss_mlp": 1.05080259, + "diversity_loss_mlp": 0.0, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.067986892151795, + "language_loss": 0.77103662, + "learning_rate": 0.00011869651000771959, + "loss": 0.78163677, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4069, + "time_per_iteration": 2.8403103351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_mlp": 1.04549229, + "diversity_loss_mlp": 0.0, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.06684521190560817, + "language_loss": 0.83076346, + "learning_rate": 0.00011849505912122117, + "loss": 0.84130692, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4070, + "time_per_iteration": 2.7008423805236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_mlp": 1.04501987, + "diversity_loss_mlp": 0.0, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07690857771038405, + "language_loss": 0.78090364, + "learning_rate": 0.00011829375634807654, + "loss": 0.79144537, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4071, + "time_per_iteration": 3.033573627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054971, + "balance_loss_mlp": 1.04576814, + "diversity_loss_mlp": 0.0, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.056420463967120596, + "language_loss": 0.81179786, + "learning_rate": 0.00011809260176643821, + "loss": 0.82234752, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4072, + "time_per_iteration": 3.047667980194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057919, + "balance_loss_mlp": 1.0486629, + "diversity_loss_mlp": 0.0, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.08201668927537556, + "language_loss": 0.83855987, + "learning_rate": 0.00011789159545440131, + "loss": 0.84913909, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4073, + "time_per_iteration": 2.5870485305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061783, + "balance_loss_mlp": 1.05281854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05483100075639626, + "language_loss": 0.82342023, + "learning_rate": 0.00011769073749000348, + "loss": 0.83403808, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4074, + "time_per_iteration": 2.7744524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_mlp": 1.05058742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.07650558225741275, + "language_loss": 0.76181698, + "learning_rate": 0.0001174900279512246, + "loss": 0.77241433, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4075, + "time_per_iteration": 2.5718233585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055959, + "balance_loss_mlp": 1.04716742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.06638794146044662, + "language_loss": 0.81755495, + "learning_rate": 0.00011728946691598707, + "loss": 0.82811451, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4076, + "time_per_iteration": 2.597710371017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057965, + "balance_loss_mlp": 1.0489229, + "diversity_loss_mlp": 0.0, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.07312696414479496, + "language_loss": 0.76038092, + "learning_rate": 0.00011708905446215561, + "loss": 0.77096057, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4077, + "time_per_iteration": 2.8587801456451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_mlp": 1.04389191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05480426452035972, + "language_loss": 0.79978698, + "learning_rate": 0.00011688879066753711, + "loss": 0.81031561, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4078, + "time_per_iteration": 2.6878645420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794674, + "balance_loss_mlp": 1.3435601, + "diversity_loss_mlp": 0.22424069, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.037025249970490705, + "language_loss": 0.87360638, + "learning_rate": 0.00011668867560988122, + "loss": 0.88155311, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077335, + "step": 4079, + "time_per_iteration": 2.605992317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055214, + "balance_loss_mlp": 1.04603505, + "diversity_loss_mlp": 0.0, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.07540056238596937, + "language_loss": 0.84502101, + "learning_rate": 0.00011648870936687916, + "loss": 0.85557318, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4080, + "time_per_iteration": 2.803166627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_mlp": 1.04527164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.07109491685615342, + "language_loss": 0.7888999, + "learning_rate": 0.00011628889201616461, + "loss": 0.79944277, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4081, + "time_per_iteration": 2.6307146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053935, + "balance_loss_mlp": 1.04494071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.06995649688675094, + "language_loss": 0.8206296, + "learning_rate": 0.00011608922363531393, + "loss": 0.83116901, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4082, + "time_per_iteration": 2.6929171085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_mlp": 1.04621124, + "diversity_loss_mlp": 0.0, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.06467745732761603, + "language_loss": 0.83401716, + "learning_rate": 0.00011588970430184504, + "loss": 0.84456635, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 4083, + "time_per_iteration": 3.0374722480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055907, + "balance_loss_mlp": 1.04704356, + "diversity_loss_mlp": 0.0, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.053416444226472466, + "language_loss": 0.81812388, + "learning_rate": 0.00011569033409321822, + "loss": 0.82868296, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4084, + "time_per_iteration": 2.7151241302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056433, + "balance_loss_mlp": 1.04721808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08362128305368578, + "language_loss": 0.72967046, + "learning_rate": 0.00011549111308683591, + "loss": 0.74023485, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4085, + "time_per_iteration": 2.703397750854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_mlp": 1.044855, + "diversity_loss_mlp": 0.0, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.07026628399198086, + "language_loss": 0.80478334, + "learning_rate": 0.00011529204136004251, + "loss": 0.81532121, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4086, + "time_per_iteration": 2.4818243980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_mlp": 1.04632854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.06468878784636958, + "language_loss": 0.84670031, + "learning_rate": 0.00011509311899012459, + "loss": 0.85725284, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4087, + "time_per_iteration": 2.6685831546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052321, + "balance_loss_mlp": 1.04333234, + "diversity_loss_mlp": 0.0, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.07857696263976417, + "language_loss": 0.781057, + "learning_rate": 0.00011489434605431053, + "loss": 0.7915802, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4088, + "time_per_iteration": 2.634192705154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050771, + "balance_loss_mlp": 1.0415858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.06849593864396217, + "language_loss": 0.81194121, + "learning_rate": 0.0001146957226297708, + "loss": 0.82244897, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4089, + "time_per_iteration": 2.6896586418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054397, + "balance_loss_mlp": 1.04508066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.06226549816004976, + "language_loss": 0.76514363, + "learning_rate": 0.00011449724879361827, + "loss": 0.77568758, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4090, + "time_per_iteration": 3.0211868286132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105233, + "balance_loss_mlp": 1.04349613, + "diversity_loss_mlp": 0.0, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.10606387135755017, + "language_loss": 0.73947829, + "learning_rate": 0.00011429892462290687, + "loss": 0.75000155, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4091, + "time_per_iteration": 2.663403034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051781, + "balance_loss_mlp": 1.04245293, + "diversity_loss_mlp": 0.0, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.07444773057019392, + "language_loss": 0.83167046, + "learning_rate": 0.00011410075019463295, + "loss": 0.84218824, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4092, + "time_per_iteration": 2.6732146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_mlp": 1.04006362, + "diversity_loss_mlp": 0.0, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.060787527331610934, + "language_loss": 0.80152667, + "learning_rate": 0.00011390272558573461, + "loss": 0.81201625, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4093, + "time_per_iteration": 2.7180373668670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_mlp": 1.03762388, + "diversity_loss_mlp": 0.0, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06490792600835427, + "language_loss": 0.7982657, + "learning_rate": 0.00011370485087309202, + "loss": 0.80873013, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4094, + "time_per_iteration": 2.6366312503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049978, + "balance_loss_mlp": 1.04087603, + "diversity_loss_mlp": 0.0, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.07475345031561743, + "language_loss": 0.79215139, + "learning_rate": 0.00011350712613352688, + "loss": 0.80265117, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4095, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.0379113, + "diversity_loss_mlp": 0.0, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08748048466921367, + "language_loss": 0.79438257, + "learning_rate": 0.00011330955144380283, + "loss": 0.8048501, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4096, + "time_per_iteration": 2.641091823577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_mlp": 1.04231441, + "diversity_loss_mlp": 0.0, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.09762790842246886, + "language_loss": 0.8590734, + "learning_rate": 0.00011311212688062483, + "loss": 0.86958992, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4097, + "time_per_iteration": 2.7734925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_mlp": 1.03907609, + "diversity_loss_mlp": 0.0, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07905994769378807, + "language_loss": 0.77729434, + "learning_rate": 0.0001129148525206402, + "loss": 0.78777593, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4098, + "time_per_iteration": 2.7954680919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043495, + "balance_loss_mlp": 1.03457785, + "diversity_loss_mlp": 0.0, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.07239705861159748, + "language_loss": 0.86597443, + "learning_rate": 0.00011271772844043759, + "loss": 0.87640929, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4099, + "time_per_iteration": 2.6607439517974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_mlp": 1.03621721, + "diversity_loss_mlp": 0.0, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0879845315874332, + "language_loss": 0.76285118, + "learning_rate": 0.00011252075471654727, + "loss": 0.7733022, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4100, + "time_per_iteration": 2.971648693084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_mlp": 1.04207063, + "diversity_loss_mlp": 0.0, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.0764302871750087, + "language_loss": 0.77711362, + "learning_rate": 0.00011232393142544133, + "loss": 0.78762579, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4101, + "time_per_iteration": 2.91229510307312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_mlp": 1.03860378, + "diversity_loss_mlp": 0.0, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.07185195333789275, + "language_loss": 0.82940054, + "learning_rate": 0.00011212725864353323, + "loss": 0.83987677, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4102, + "time_per_iteration": 3.1023645401000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_mlp": 1.02088332, + "diversity_loss_mlp": 0.0, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.024083596003167965, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77361244, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4103, + "time_per_iteration": 4.869060754776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.03684092, + "diversity_loss_mlp": 0.0, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.08808407727788632, + "language_loss": 0.75807375, + "learning_rate": 0.00011173436491267291, + "loss": 0.76853269, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4104, + "time_per_iteration": 2.632619619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051432, + "balance_loss_mlp": 1.04226446, + "diversity_loss_mlp": 0.0, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.06591293045265766, + "language_loss": 0.81841874, + "learning_rate": 0.0001115381441162554, + "loss": 0.82893306, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4105, + "time_per_iteration": 2.6688740253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015618, + "balance_loss_mlp": 1.0112071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.01578072375455914, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74599338, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4106, + "time_per_iteration": 4.878762245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.041677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06419159755656932, + "language_loss": 0.85182965, + "learning_rate": 0.00011114615504234465, + "loss": 0.86233652, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4107, + "time_per_iteration": 2.7453701496124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_mlp": 1.03746724, + "diversity_loss_mlp": 0.0, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.07341048206377168, + "language_loss": 0.80923963, + "learning_rate": 0.00011095038691703468, + "loss": 0.81970477, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4108, + "time_per_iteration": 2.857043504714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_mlp": 1.03800678, + "diversity_loss_mlp": 0.0, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.06655370110946672, + "language_loss": 0.82816958, + "learning_rate": 0.00011075476983417998, + "loss": 0.83864009, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4109, + "time_per_iteration": 2.8551764488220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.04054475, + "diversity_loss_mlp": 0.0, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.08565145998771567, + "language_loss": 0.7770009, + "learning_rate": 0.00011055930386972579, + "loss": 0.78750026, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 4110, + "time_per_iteration": 2.9051218032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_mlp": 1.03950906, + "diversity_loss_mlp": 0.0, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.07889594156212229, + "language_loss": 0.78524226, + "learning_rate": 0.00011036398909955863, + "loss": 0.79572868, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4111, + "time_per_iteration": 2.9591848850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00801967, + "balance_loss_mlp": 1.35861206, + "diversity_loss_mlp": 0.22341654, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.031814716701276446, + "language_loss": 0.81445456, + "learning_rate": 0.00011016882559950648, + "loss": 0.82247424, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0109526, + "step": 4112, + "time_per_iteration": 2.8517532348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.04066622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.06825914372029093, + "language_loss": 0.80628312, + "learning_rate": 0.00010997381344533853, + "loss": 0.81678075, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4113, + "time_per_iteration": 2.76458477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_mlp": 1.04482937, + "diversity_loss_mlp": 0.0, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.06296725861693256, + "language_loss": 0.80975449, + "learning_rate": 0.00010977895271276517, + "loss": 0.82029676, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4114, + "time_per_iteration": 2.677236795425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105271, + "balance_loss_mlp": 1.04387641, + "diversity_loss_mlp": 0.0, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.07698010071595295, + "language_loss": 0.79882276, + "learning_rate": 0.00010958424347743807, + "loss": 0.80934995, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4115, + "time_per_iteration": 2.7255280017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056753, + "balance_loss_mlp": 1.04793203, + "diversity_loss_mlp": 0.0, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06323084510093162, + "language_loss": 0.80379033, + "learning_rate": 0.00010938968581494991, + "loss": 0.81435782, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4116, + "time_per_iteration": 2.956744909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056354, + "balance_loss_mlp": 1.0473659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.07593804019744407, + "language_loss": 0.78918922, + "learning_rate": 0.000109195279800835, + "loss": 0.79975271, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4117, + "time_per_iteration": 2.7232017517089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052824, + "balance_loss_mlp": 1.04372239, + "diversity_loss_mlp": 0.0, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07668598230710005, + "language_loss": 0.76558191, + "learning_rate": 0.00010900102551056834, + "loss": 0.77611017, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4118, + "time_per_iteration": 3.0348682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_mlp": 1.04203153, + "diversity_loss_mlp": 0.0, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.06933579681898581, + "language_loss": 0.8458457, + "learning_rate": 0.00010880692301956601, + "loss": 0.85635561, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4119, + "time_per_iteration": 2.465395212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059146, + "balance_loss_mlp": 1.05027056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.06493837690301978, + "language_loss": 0.86651456, + "learning_rate": 0.00010861297240318518, + "loss": 0.87710601, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4120, + "time_per_iteration": 2.8506181240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.04826188, + "diversity_loss_mlp": 0.0, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.07524766323731863, + "language_loss": 0.87229133, + "learning_rate": 0.00010841917373672444, + "loss": 0.88286078, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4121, + "time_per_iteration": 2.745227336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.04712808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.08118940133699648, + "language_loss": 0.78629029, + "learning_rate": 0.00010822552709542293, + "loss": 0.79684877, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4122, + "time_per_iteration": 2.813340425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_mlp": 1.04677343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.058728515527731805, + "language_loss": 0.86142117, + "learning_rate": 0.0001080320325544612, + "loss": 0.87197673, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4123, + "time_per_iteration": 2.6903398036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_mlp": 1.04438257, + "diversity_loss_mlp": 0.0, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.06377375336372411, + "language_loss": 0.83519953, + "learning_rate": 0.00010783869018895997, + "loss": 0.84573305, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4124, + "time_per_iteration": 2.6091437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_mlp": 1.04709673, + "diversity_loss_mlp": 0.0, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.06290112703691109, + "language_loss": 0.84019685, + "learning_rate": 0.00010764550007398189, + "loss": 0.85075527, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4125, + "time_per_iteration": 2.639021396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105337, + "balance_loss_mlp": 1.04447079, + "diversity_loss_mlp": 0.0, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.059983052052207615, + "language_loss": 0.81026101, + "learning_rate": 0.00010745246228452982, + "loss": 0.8207947, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4126, + "time_per_iteration": 2.567128896713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_mlp": 1.04658413, + "diversity_loss_mlp": 0.0, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.06538981258691282, + "language_loss": 0.81837595, + "learning_rate": 0.00010725957689554771, + "loss": 0.82892644, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.08477783, + "routerloss_mlp": 0.0, + "step": 4127, + "time_per_iteration": 2.7668473720550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105364, + "balance_loss_mlp": 1.04483056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.06455760363891609, + "language_loss": 0.84442085, + "learning_rate": 0.00010706684398192013, + "loss": 0.85495722, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4128, + "time_per_iteration": 2.703094482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.04694915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.10398066376678644, + "language_loss": 0.81773114, + "learning_rate": 0.00010687426361847313, + "loss": 0.82829189, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4129, + "time_per_iteration": 2.730570077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_mlp": 1.04571033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.06937610081260179, + "language_loss": 0.8574326, + "learning_rate": 0.00010668183587997254, + "loss": 0.86797965, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4130, + "time_per_iteration": 2.644259452819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_mlp": 1.04217792, + "diversity_loss_mlp": 0.0, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05953600763070223, + "language_loss": 0.77579701, + "learning_rate": 0.0001064895608411256, + "loss": 0.78630781, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4131, + "time_per_iteration": 2.841925859451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105178, + "balance_loss_mlp": 1.04286337, + "diversity_loss_mlp": 0.0, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06486183241314894, + "language_loss": 0.80494809, + "learning_rate": 0.00010629743857657998, + "loss": 0.81546587, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4132, + "time_per_iteration": 2.9550116062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007878, + "balance_loss_mlp": 1.00334787, + "diversity_loss_mlp": 0.0, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.014279472424614392, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71606547, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4133, + "time_per_iteration": 4.61087965965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_mlp": 1.05091596, + "diversity_loss_mlp": 0.0, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.08419096338195846, + "language_loss": 0.82037973, + "learning_rate": 0.00010591365266868802, + "loss": 0.83097553, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 4134, + "time_per_iteration": 2.980473518371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006707, + "balance_loss_mlp": 1.00217748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.013377465040040408, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76518488, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4135, + "time_per_iteration": 5.031512975692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051378, + "balance_loss_mlp": 1.04224622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.08143958467983652, + "language_loss": 0.7928952, + "learning_rate": 0.00010553047875229166, + "loss": 0.80340898, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4136, + "time_per_iteration": 2.536219596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053745, + "balance_loss_mlp": 1.04491794, + "diversity_loss_mlp": 0.0, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.05917621440441134, + "language_loss": 0.8352496, + "learning_rate": 0.00010533912147689328, + "loss": 0.84578705, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4137, + "time_per_iteration": 2.62947416305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052392, + "balance_loss_mlp": 1.04364753, + "diversity_loss_mlp": 0.0, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.07247645097842569, + "language_loss": 0.82383895, + "learning_rate": 0.00010514791742243656, + "loss": 0.83436286, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4138, + "time_per_iteration": 2.6058223247528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053846, + "balance_loss_mlp": 1.04486322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.07856202151848143, + "language_loss": 0.82678479, + "learning_rate": 0.00010495686666315341, + "loss": 0.83732331, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4139, + "time_per_iteration": 2.8820180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_mlp": 1.04509258, + "diversity_loss_mlp": 0.0, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.09207393340076041, + "language_loss": 0.77504325, + "learning_rate": 0.00010476596927321635, + "loss": 0.78558183, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4140, + "time_per_iteration": 2.5876264572143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.04586816, + "diversity_loss_mlp": 0.0, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.06332389355869186, + "language_loss": 0.80286723, + "learning_rate": 0.00010457522532673835, + "loss": 0.81341445, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4141, + "time_per_iteration": 2.7853429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053842, + "balance_loss_mlp": 1.04521155, + "diversity_loss_mlp": 0.0, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.07594916891501999, + "language_loss": 0.83322799, + "learning_rate": 0.00010438463489777272, + "loss": 0.84376645, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4142, + "time_per_iteration": 2.574995756149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0441432, + "diversity_loss_mlp": 0.0, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06219380630034642, + "language_loss": 0.77388006, + "learning_rate": 0.00010419419806031316, + "loss": 0.78441548, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 4143, + "time_per_iteration": 2.681364059448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057205, + "balance_loss_mlp": 1.04838395, + "diversity_loss_mlp": 0.0, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.06244291716660837, + "language_loss": 0.83778638, + "learning_rate": 0.00010400391488829403, + "loss": 0.84835839, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4144, + "time_per_iteration": 2.7661397457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_mlp": 1.04754949, + "diversity_loss_mlp": 0.0, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.056029857219710606, + "language_loss": 0.86605, + "learning_rate": 0.00010381378545558984, + "loss": 0.87661684, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4145, + "time_per_iteration": 2.706909656524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_mlp": 1.04191816, + "diversity_loss_mlp": 0.0, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.06718577287314217, + "language_loss": 0.84665811, + "learning_rate": 0.00010362380983601505, + "loss": 0.85716891, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4146, + "time_per_iteration": 2.529480218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055069, + "balance_loss_mlp": 1.04609227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.0571367932207486, + "language_loss": 0.7866556, + "learning_rate": 0.00010343398810332477, + "loss": 0.79720628, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4147, + "time_per_iteration": 3.4586639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105507, + "balance_loss_mlp": 1.04595661, + "diversity_loss_mlp": 0.0, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.07566676342485233, + "language_loss": 0.84437156, + "learning_rate": 0.00010324432033121467, + "loss": 0.85492229, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4148, + "time_per_iteration": 2.8839025497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053366, + "balance_loss_mlp": 1.04418659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.06830192551222886, + "language_loss": 0.83435208, + "learning_rate": 0.00010305480659332005, + "loss": 0.84488571, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4149, + "time_per_iteration": 2.5951197147369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059283, + "balance_loss_mlp": 1.05012214, + "diversity_loss_mlp": 0.0, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.07563453451103978, + "language_loss": 0.83492422, + "learning_rate": 0.00010286544696321682, + "loss": 0.84551704, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4150, + "time_per_iteration": 2.5118510723114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055471, + "balance_loss_mlp": 1.04628563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.07562833621575128, + "language_loss": 0.7924732, + "learning_rate": 0.00010267624151442073, + "loss": 0.80302793, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4151, + "time_per_iteration": 2.612138509750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052309, + "balance_loss_mlp": 1.04312396, + "diversity_loss_mlp": 0.0, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.07020647270289845, + "language_loss": 0.80794007, + "learning_rate": 0.000102487190320388, + "loss": 0.81846315, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4152, + "time_per_iteration": 3.3858306407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_mlp": 1.0432297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.08528953367031804, + "language_loss": 0.79654646, + "learning_rate": 0.00010229829345451475, + "loss": 0.80707145, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4153, + "time_per_iteration": 3.326597213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.04706669, + "diversity_loss_mlp": 0.0, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06462141101761633, + "language_loss": 0.79619837, + "learning_rate": 0.00010210955099013724, + "loss": 0.80676001, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4154, + "time_per_iteration": 3.3817038536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054824, + "balance_loss_mlp": 1.04566312, + "diversity_loss_mlp": 0.0, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07616557599778462, + "language_loss": 0.76846623, + "learning_rate": 0.00010192096300053167, + "loss": 0.77901447, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4155, + "time_per_iteration": 3.081740379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105106, + "balance_loss_mlp": 1.04188037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.0612954553036602, + "language_loss": 0.85157597, + "learning_rate": 0.00010173252955891477, + "loss": 0.86208659, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4156, + "time_per_iteration": 2.7239129543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055997, + "balance_loss_mlp": 1.04709256, + "diversity_loss_mlp": 0.0, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07720224754254114, + "language_loss": 0.73362273, + "learning_rate": 0.00010154425073844253, + "loss": 0.74418271, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4157, + "time_per_iteration": 2.696467638015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052019, + "balance_loss_mlp": 1.04316235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.060505733748086536, + "language_loss": 0.82517296, + "learning_rate": 0.00010135612661221138, + "loss": 0.83569312, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4158, + "time_per_iteration": 2.582913398742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_mlp": 1.03880203, + "diversity_loss_mlp": 0.0, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.08198302238912947, + "language_loss": 0.81945235, + "learning_rate": 0.00010116815725325751, + "loss": 0.82993186, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4159, + "time_per_iteration": 3.28433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798548, + "balance_loss_mlp": 1.34939909, + "diversity_loss_mlp": 0.22584054, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.032371691049230863, + "language_loss": 0.80472159, + "learning_rate": 0.00010098034273455725, + "loss": 0.81270707, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01092844, + "step": 4160, + "time_per_iteration": 3.020301342010498 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9388404519337984.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/training_args.bin b/sft_pretrain/Full_competesmoev30/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_competesmoev30/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/config.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28a5bb1c149304f33214eee3c6e2764711ffb065 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/generation_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dbad92662bc8b3b5cbb461aa7345780ed2de5aa --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a8ba0e167338e30d6c9ed4fef48a37dbfc352649855d7a76565ce03d801afe +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df8c33216a2d7dd939b6e1ad1215534cf815f9af --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ca3815819bbec79bbac6abbaac0503292879c138aec413395d13731783cfe2 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..529dc7d6e7c6a51b963a45fc81dc6d282fd0f646 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c92033fa819028ca4d8019edd41a4dbfe1168a3f53c59e0aad9aed2ed6e3ba +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0671eaab5fc72c068ce37bf41874853b9c07357f --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5a0e7361f4a49777ed8bb5a11d804080b2564f0df73eb51837bedfc009bb04 +size 396582032 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c6834eaf2bc28d0fb221caac1808680a54bd72b --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e73fd0fbedc1225e92e8ea14ab4bb504e7b4233639c26137c62c227b2ecb979 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2d993e46c557eb14fd584340dd8c83af6ad37b4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eac8cfcf47a58edf25bc44c7706e5239b8336c220acc7347fdf3a6cf070c350 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab1cca65aa636e42c547624881992aef3593809d --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fe7cb3ed20001810a40aea8281a1102f2273fac8021f4cc6bbd29aae734ddb +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..105fbe35c6ad1b96ab05b2325a662b048b610fde --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ec487ae6df783a46bca7d0ed5d1b91a1928034cd4e20ac58d44d5dc0370a46 +size 2117326886 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/latest b/sft_pretrain/Full_competesmoev30/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..221517641f8c3e836c30a881dbeae36e687c8737 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b87b0e369f9a71b0854220a5351ec7cad9e6d1184d114409009a80f2629f49 +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..74aaffdc337c5a168a279aed341c53617abfb292 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7428511a0f39116505eb0e78fefd1d50fe2ddacee4482cdd5d925938d450347 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_competesmoev30/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..90258f60dff5808ec5ba71b4ca9572c9c2ebe986 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/trainer_state.json @@ -0,0 +1,87799 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.10879421, + "diversity_loss_mlp": 0.0, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0911420547130344, + "language_loss": 0.8431657, + "learning_rate": 0.000925888133132719, + "loss": 0.85441184, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1041, + "time_per_iteration": 2.780141830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_mlp": 1.05260694, + "diversity_loss_mlp": 0.0, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.04139604987307943, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80673575, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 1042, + "time_per_iteration": 4.971017360687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100746, + "balance_loss_mlp": 1.08498645, + "diversity_loss_mlp": 0.0, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.08950731646766712, + "language_loss": 0.81070006, + "learning_rate": 0.0009255613649386244, + "loss": 0.82170749, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.1574707, + "routerloss_mlp": 0.0, + "step": 1043, + "time_per_iteration": 2.6508612632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091355, + "balance_loss_mlp": 1.07623935, + "diversity_loss_mlp": 0.0, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07614483401418765, + "language_loss": 0.78829026, + "learning_rate": 0.0009253977329834838, + "loss": 0.79920387, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1044, + "time_per_iteration": 2.7090582847595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109464, + "balance_loss_mlp": 1.07947624, + "diversity_loss_mlp": 0.0, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.0989854096864982, + "language_loss": 0.86366481, + "learning_rate": 0.0009252339358742965, + "loss": 0.8746112, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1045, + "time_per_iteration": 2.801323652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_mlp": 1.08526874, + "diversity_loss_mlp": 0.0, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07994799859902735, + "language_loss": 0.83704323, + "learning_rate": 0.000925069973674654, + "loss": 0.84804672, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1046, + "time_per_iteration": 2.6286635398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_mlp": 1.09036636, + "diversity_loss_mlp": 0.0, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.05803081938267982, + "language_loss": 0.88841283, + "learning_rate": 0.000924905846448212, + "loss": 0.89945889, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1047, + "time_per_iteration": 2.7208023071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.12078381, + "diversity_loss_mlp": 0.0, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09159511175118457, + "language_loss": 0.85692465, + "learning_rate": 0.0009247415542586906, + "loss": 0.86827493, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1048, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089504, + "balance_loss_mlp": 1.55797935, + "diversity_loss_mlp": 0.19993141, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.028193920194447036, + "language_loss": 0.83094788, + "learning_rate": 0.0009245770971698735, + "loss": 0.83989829, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01608507, + "step": 1049, + "time_per_iteration": 2.922792911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.12878203, + "diversity_loss_mlp": 0.0, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08345797467079887, + "language_loss": 0.88434327, + "learning_rate": 0.0009244124752456087, + "loss": 0.89577425, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1050, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141188, + "balance_loss_mlp": 1.12675214, + "diversity_loss_mlp": 0.0, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07479960387863874, + "language_loss": 0.85303241, + "learning_rate": 0.0009242476885498081, + "loss": 0.86444432, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1051, + "time_per_iteration": 2.8012773990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146474, + "balance_loss_mlp": 1.13181126, + "diversity_loss_mlp": 0.0, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.07632391919964465, + "language_loss": 0.81114984, + "learning_rate": 0.0009240827371464474, + "loss": 0.82261455, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1052, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146729, + "balance_loss_mlp": 1.1323998, + "diversity_loss_mlp": 0.0, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.11219768477147798, + "language_loss": 0.84167284, + "learning_rate": 0.0009239176210995666, + "loss": 0.85314012, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1053, + "time_per_iteration": 3.4905290603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153158, + "balance_loss_mlp": 1.13878179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.07345468089138417, + "language_loss": 0.93850195, + "learning_rate": 0.0009237523404732695, + "loss": 0.95003355, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1054, + "time_per_iteration": 2.8854215145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116831, + "balance_loss_mlp": 1.15374279, + "diversity_loss_mlp": 0.0, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08788286689344726, + "language_loss": 0.84136868, + "learning_rate": 0.0009235868953317235, + "loss": 0.85305184, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1055, + "time_per_iteration": 2.785616397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115453, + "balance_loss_mlp": 1.14033246, + "diversity_loss_mlp": 0.0, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.07006303181868268, + "language_loss": 0.85314858, + "learning_rate": 0.0009234212857391602, + "loss": 0.86469388, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1056, + "time_per_iteration": 3.192293167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167757, + "balance_loss_mlp": 1.15304708, + "diversity_loss_mlp": 0.0, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.07469852363602907, + "language_loss": 0.89220309, + "learning_rate": 0.000923255511759875, + "loss": 0.9038806, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1057, + "time_per_iteration": 2.783778429031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881428, + "balance_loss_mlp": 1.53356147, + "diversity_loss_mlp": 0.1968638, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.032510948660132113, + "language_loss": 0.84587663, + "learning_rate": 0.000923089573458227, + "loss": 0.85469091, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621579, + "step": 1058, + "time_per_iteration": 2.8847100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_mlp": 1.13623881, + "diversity_loss_mlp": 0.0, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.11181454207252314, + "language_loss": 0.83516467, + "learning_rate": 0.0009229234708986392, + "loss": 0.84667218, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1059, + "time_per_iteration": 2.9079415798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172867, + "balance_loss_mlp": 1.16251993, + "diversity_loss_mlp": 0.0, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.06024273804144221, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82839763, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1060, + "time_per_iteration": 4.646218776702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112152, + "balance_loss_mlp": 1.10713172, + "diversity_loss_mlp": 0.0, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.08928557521337042, + "language_loss": 0.85345757, + "learning_rate": 0.0009225907732636548, + "loss": 0.86467278, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1061, + "time_per_iteration": 2.745448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_mlp": 1.09209883, + "diversity_loss_mlp": 0.0, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.079028173596336, + "language_loss": 0.86936563, + "learning_rate": 0.0009224241783174227, + "loss": 0.88042819, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1062, + "time_per_iteration": 2.6923935413360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_mlp": 1.07616472, + "diversity_loss_mlp": 0.0, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.07452632641130948, + "language_loss": 0.85384166, + "learning_rate": 0.0009222574193715802, + "loss": 0.86474669, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1063, + "time_per_iteration": 2.7701327800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07850981, + "diversity_loss_mlp": 0.0, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06517233034985846, + "language_loss": 0.85915947, + "learning_rate": 0.000922090496490869, + "loss": 0.87008905, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1064, + "time_per_iteration": 2.7387099266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098934, + "balance_loss_mlp": 1.08404493, + "diversity_loss_mlp": 0.0, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.06963355430403552, + "language_loss": 0.89889115, + "learning_rate": 0.0009219234097400937, + "loss": 0.90988052, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1065, + "time_per_iteration": 2.859334707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112884, + "balance_loss_mlp": 1.09778059, + "diversity_loss_mlp": 0.0, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06723697540994414, + "language_loss": 0.83086514, + "learning_rate": 0.0009217561591841237, + "loss": 0.84199405, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1066, + "time_per_iteration": 3.3065547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00886484, + "balance_loss_mlp": 1.54046464, + "diversity_loss_mlp": 0.1982768, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.03984406199709606, + "language_loss": 0.80820358, + "learning_rate": 0.0009215887448878913, + "loss": 0.8170684, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01711285, + "step": 1067, + "time_per_iteration": 2.6291754245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131678, + "balance_loss_mlp": 1.11697936, + "diversity_loss_mlp": 0.0, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.07633348035576148, + "language_loss": 0.85365784, + "learning_rate": 0.0009214211669163922, + "loss": 0.86497462, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1068, + "time_per_iteration": 2.747936725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_mlp": 1.12220347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.07197705825645119, + "language_loss": 0.9405331, + "learning_rate": 0.0009212534253346862, + "loss": 0.95190227, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1069, + "time_per_iteration": 2.696131467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128507, + "balance_loss_mlp": 1.11372542, + "diversity_loss_mlp": 0.0, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09743186487320747, + "language_loss": 0.84269625, + "learning_rate": 0.0009210855202078964, + "loss": 0.85398132, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1070, + "time_per_iteration": 2.6194372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114316, + "balance_loss_mlp": 1.12903321, + "diversity_loss_mlp": 0.0, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08033414700046611, + "language_loss": 0.87081122, + "learning_rate": 0.0009209174516012091, + "loss": 0.88224292, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1071, + "time_per_iteration": 2.5169904232025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146914, + "balance_loss_mlp": 1.13247752, + "diversity_loss_mlp": 0.0, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.06769648970134874, + "language_loss": 0.89207751, + "learning_rate": 0.0009207492195798747, + "loss": 0.90354669, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1072, + "time_per_iteration": 2.804577112197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.12303698, + "diversity_loss_mlp": 0.0, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.0857236005827703, + "language_loss": 0.84780991, + "learning_rate": 0.0009205808242092061, + "loss": 0.85918474, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1073, + "time_per_iteration": 2.6134936809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122455, + "balance_loss_mlp": 1.10787559, + "diversity_loss_mlp": 0.0, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.09531084522047072, + "language_loss": 0.82512677, + "learning_rate": 0.0009204122655545808, + "loss": 0.83635134, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1074, + "time_per_iteration": 3.461315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888955, + "balance_loss_mlp": 1.54418314, + "diversity_loss_mlp": 0.20175909, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.03221822204199988, + "language_loss": 0.80952764, + "learning_rate": 0.0009202435436814388, + "loss": 0.81841719, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01598355, + "step": 1075, + "time_per_iteration": 2.728055238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146745, + "balance_loss_mlp": 1.13259482, + "diversity_loss_mlp": 0.0, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.0831097658087499, + "language_loss": 0.89925295, + "learning_rate": 0.0009200746586552836, + "loss": 0.91072041, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1076, + "time_per_iteration": 2.929422616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.12185347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.07960863169785164, + "language_loss": 0.84148425, + "learning_rate": 0.0009199056105416825, + "loss": 0.85284609, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1077, + "time_per_iteration": 3.0795576572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_mlp": 1.13384151, + "diversity_loss_mlp": 0.0, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.06589509494701294, + "language_loss": 0.86599898, + "learning_rate": 0.0009197363994062654, + "loss": 0.87747955, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1078, + "time_per_iteration": 2.8304550647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891417, + "balance_loss_mlp": 1.54815006, + "diversity_loss_mlp": 0.20151556, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.027729032115243194, + "language_loss": 0.84302026, + "learning_rate": 0.0009195670253147262, + "loss": 0.85193443, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01658459, + "step": 1079, + "time_per_iteration": 2.987715005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168872, + "balance_loss_mlp": 1.15472198, + "diversity_loss_mlp": 0.0, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07878432741989363, + "language_loss": 0.82508785, + "learning_rate": 0.0009193974883328216, + "loss": 0.83677661, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1080, + "time_per_iteration": 2.6007754802703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178335, + "balance_loss_mlp": 1.16408908, + "diversity_loss_mlp": 0.0, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06872318796781544, + "language_loss": 0.86871535, + "learning_rate": 0.0009192277885263718, + "loss": 0.88049871, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1081, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_mlp": 1.15339386, + "diversity_loss_mlp": 0.0, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.08475435362049728, + "language_loss": 0.86010319, + "learning_rate": 0.0009190579259612602, + "loss": 0.87178093, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1082, + "time_per_iteration": 3.2688331604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153529, + "balance_loss_mlp": 1.13914001, + "diversity_loss_mlp": 0.0, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.06676527060715894, + "language_loss": 0.86419082, + "learning_rate": 0.000918887900703433, + "loss": 0.8757261, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1083, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129996, + "balance_loss_mlp": 1.11559522, + "diversity_loss_mlp": 0.0, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07296749014166971, + "language_loss": 0.89779425, + "learning_rate": 0.0009187177128188999, + "loss": 0.90909421, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1084, + "time_per_iteration": 2.441312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_mlp": 1.11915255, + "diversity_loss_mlp": 0.0, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.053207927956046876, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78285372, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1085, + "time_per_iteration": 4.864179849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117368, + "balance_loss_mlp": 1.1029439, + "diversity_loss_mlp": 0.0, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07905606819783856, + "language_loss": 0.85833263, + "learning_rate": 0.000918376849434071, + "loss": 0.86950636, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1086, + "time_per_iteration": 4.049270868301392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112999, + "balance_loss_mlp": 1.09849179, + "diversity_loss_mlp": 0.0, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.08954509639668791, + "language_loss": 0.90778226, + "learning_rate": 0.0009182061740661098, + "loss": 0.91891223, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1087, + "time_per_iteration": 2.557358741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128974, + "balance_loss_mlp": 1.11446643, + "diversity_loss_mlp": 0.0, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08446380837501397, + "language_loss": 0.85054636, + "learning_rate": 0.0009180353363361127, + "loss": 0.86183608, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1088, + "time_per_iteration": 3.0897305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118573, + "balance_loss_mlp": 1.10417306, + "diversity_loss_mlp": 0.0, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.08173869768976531, + "language_loss": 0.82508695, + "learning_rate": 0.0009178643363104044, + "loss": 0.83627272, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1089, + "time_per_iteration": 3.124645948410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_mlp": 1.09938824, + "diversity_loss_mlp": 0.0, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.09307233053408402, + "language_loss": 0.90518665, + "learning_rate": 0.0009176931740553735, + "loss": 0.9163233, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1090, + "time_per_iteration": 2.6098225116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.09981966, + "diversity_loss_mlp": 0.0, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.09489388322063774, + "language_loss": 0.8240813, + "learning_rate": 0.0009175218496374708, + "loss": 0.83521861, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1091, + "time_per_iteration": 3.336355686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110612, + "balance_loss_mlp": 1.09205294, + "diversity_loss_mlp": 0.0, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.08870561470384966, + "language_loss": 0.86057436, + "learning_rate": 0.0009173503631232103, + "loss": 0.87163556, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1092, + "time_per_iteration": 3.356015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106884, + "balance_loss_mlp": 1.09269798, + "diversity_loss_mlp": 0.0, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09478788106803046, + "language_loss": 0.82067865, + "learning_rate": 0.0009171787145791691, + "loss": 0.83174753, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1093, + "time_per_iteration": 3.2546143531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116222, + "balance_loss_mlp": 1.10199988, + "diversity_loss_mlp": 0.0, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14674509624116924, + "language_loss": 0.80160701, + "learning_rate": 0.000917006904071987, + "loss": 0.81276917, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1094, + "time_per_iteration": 2.5837080478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911953, + "balance_loss_mlp": 1.58726883, + "diversity_loss_mlp": 0.20477253, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.035943125208157026, + "language_loss": 0.8737694, + "learning_rate": 0.0009168349316683669, + "loss": 0.88288891, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01593196, + "step": 1095, + "time_per_iteration": 2.768296718597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136825, + "balance_loss_mlp": 1.1224122, + "diversity_loss_mlp": 0.0, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06639171103878667, + "language_loss": 0.82719827, + "learning_rate": 0.0009166627974350741, + "loss": 0.83856648, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1096, + "time_per_iteration": 2.8819992542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145046, + "balance_loss_mlp": 1.13041949, + "diversity_loss_mlp": 0.0, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.08337696606413014, + "language_loss": 0.89929205, + "learning_rate": 0.0009164905014389373, + "loss": 0.91074252, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1097, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_mlp": 1.1495918, + "diversity_loss_mlp": 0.0, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08033808486911229, + "language_loss": 0.86386079, + "learning_rate": 0.0009163180437468476, + "loss": 0.87549889, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1098, + "time_per_iteration": 2.6314592361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176615, + "balance_loss_mlp": 1.16195273, + "diversity_loss_mlp": 0.0, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.09094665560265827, + "language_loss": 0.85629344, + "learning_rate": 0.000916145424425759, + "loss": 0.86805964, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1099, + "time_per_iteration": 2.6608541011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181873, + "balance_loss_mlp": 1.16744852, + "diversity_loss_mlp": 0.0, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.09944182260515583, + "language_loss": 0.9083795, + "learning_rate": 0.0009159726435426885, + "loss": 0.9201982, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1100, + "time_per_iteration": 3.0502405166625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.134619, + "diversity_loss_mlp": 0.0, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.09151162791452093, + "language_loss": 0.90900993, + "learning_rate": 0.0009157997011647154, + "loss": 0.92050231, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1101, + "time_per_iteration": 2.6048476696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127613, + "balance_loss_mlp": 1.11389172, + "diversity_loss_mlp": 0.0, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.07696729699318336, + "language_loss": 0.86130077, + "learning_rate": 0.0009156265973589817, + "loss": 0.87257689, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1102, + "time_per_iteration": 2.7552144527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114805, + "balance_loss_mlp": 1.10088181, + "diversity_loss_mlp": 0.0, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.07661877314329607, + "language_loss": 0.89485067, + "learning_rate": 0.0009154533321926926, + "loss": 0.90599877, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.13909912, + "routerloss_mlp": 0.0, + "step": 1103, + "time_per_iteration": 4.073851108551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_mlp": 1.09134197, + "diversity_loss_mlp": 0.0, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08363594534482698, + "language_loss": 0.8717171, + "learning_rate": 0.0009152799057331156, + "loss": 0.88276958, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1104, + "time_per_iteration": 3.142221450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_mlp": 1.08656633, + "diversity_loss_mlp": 0.0, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1056362594360365, + "language_loss": 0.91270363, + "learning_rate": 0.0009151063180475805, + "loss": 0.92370498, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1105, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095772, + "balance_loss_mlp": 1.08196795, + "diversity_loss_mlp": 0.0, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.08072473316090223, + "language_loss": 0.84285367, + "learning_rate": 0.0009149325692034803, + "loss": 0.85381138, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1106, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.06266928, + "diversity_loss_mlp": 0.0, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.04229613635199888, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.8027482, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1107, + "time_per_iteration": 4.817704916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129097, + "balance_loss_mlp": 1.11547112, + "diversity_loss_mlp": 0.0, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.07382538641756346, + "language_loss": 0.8748607, + "learning_rate": 0.0009145845883094678, + "loss": 0.88615161, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1108, + "time_per_iteration": 3.039318561553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150208, + "balance_loss_mlp": 1.13671303, + "diversity_loss_mlp": 0.0, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07887220377556703, + "language_loss": 0.85174125, + "learning_rate": 0.000914410356394654, + "loss": 0.86324334, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1109, + "time_per_iteration": 2.76413893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_mlp": 1.1484766, + "diversity_loss_mlp": 0.0, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.06362602917472766, + "language_loss": 0.84447891, + "learning_rate": 0.0009142359635914709, + "loss": 0.85609984, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1110, + "time_per_iteration": 3.007201671600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163563, + "balance_loss_mlp": 1.15004468, + "diversity_loss_mlp": 0.0, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07633144605420673, + "language_loss": 0.84598219, + "learning_rate": 0.0009140614099676245, + "loss": 0.85761786, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1111, + "time_per_iteration": 2.569401979446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161722, + "balance_loss_mlp": 1.14807272, + "diversity_loss_mlp": 0.0, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.0712977258009472, + "language_loss": 0.82590818, + "learning_rate": 0.0009138866955908821, + "loss": 0.83752549, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1112, + "time_per_iteration": 2.870701789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_mlp": 1.15294182, + "diversity_loss_mlp": 0.0, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.09239605609063735, + "language_loss": 0.80485952, + "learning_rate": 0.0009137118205290738, + "loss": 0.81652606, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.13739014, + "routerloss_mlp": 0.0, + "step": 1113, + "time_per_iteration": 2.9623591899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174843, + "balance_loss_mlp": 1.16082442, + "diversity_loss_mlp": 0.0, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08763873550503462, + "language_loss": 0.90553653, + "learning_rate": 0.0009135367848500924, + "loss": 0.91728497, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1114, + "time_per_iteration": 2.5287492275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.15138936, + "diversity_loss_mlp": 0.0, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.11593363319598911, + "language_loss": 0.86361086, + "learning_rate": 0.0009133615886218927, + "loss": 0.87526232, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1115, + "time_per_iteration": 2.6945505142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141616, + "balance_loss_mlp": 1.12725139, + "diversity_loss_mlp": 0.0, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08371979294567897, + "language_loss": 0.87389791, + "learning_rate": 0.0009131862319124917, + "loss": 0.88531411, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1116, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130003, + "balance_loss_mlp": 1.1162107, + "diversity_loss_mlp": 0.0, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08272793517794225, + "language_loss": 0.83981287, + "learning_rate": 0.0009130107147899691, + "loss": 0.85111284, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1117, + "time_per_iteration": 2.698151111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.1039083, + "diversity_loss_mlp": 0.0, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.4685945915436946, + "language_loss": 0.85086691, + "learning_rate": 0.0009128350373224665, + "loss": 0.86204791, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1118, + "time_per_iteration": 2.545565128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059182, + "balance_loss_mlp": 1.04950213, + "diversity_loss_mlp": 0.0, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.03761711697708654, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82515609, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1119, + "time_per_iteration": 4.648902416229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118843, + "balance_loss_mlp": 1.10412121, + "diversity_loss_mlp": 0.0, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07492511871579786, + "language_loss": 0.85205054, + "learning_rate": 0.0009124832016254005, + "loss": 0.86323893, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1120, + "time_per_iteration": 2.5875513553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112958, + "balance_loss_mlp": 1.11404657, + "diversity_loss_mlp": 0.0, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.10623123993924175, + "language_loss": 0.88117284, + "learning_rate": 0.0009123070435324316, + "loss": 0.89246857, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1121, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119114, + "balance_loss_mlp": 1.10852826, + "diversity_loss_mlp": 0.0, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.05861429426141409, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78994894, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 1122, + "time_per_iteration": 4.993450880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114894, + "balance_loss_mlp": 1.13229823, + "diversity_loss_mlp": 0.0, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.09758120262844092, + "language_loss": 0.86477894, + "learning_rate": 0.0009119542471995752, + "loss": 0.87626839, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 1123, + "time_per_iteration": 2.8260560035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132116, + "balance_loss_mlp": 1.1160109, + "diversity_loss_mlp": 0.0, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.1175490331770948, + "language_loss": 0.81597894, + "learning_rate": 0.0009117776090966554, + "loss": 0.82730007, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1124, + "time_per_iteration": 2.955768585205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133281, + "balance_loss_mlp": 1.1166153, + "diversity_loss_mlp": 0.0, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08908783615486303, + "language_loss": 0.86717665, + "learning_rate": 0.0009116008111274899, + "loss": 0.87850952, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1125, + "time_per_iteration": 3.2493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_mlp": 1.02921367, + "diversity_loss_mlp": 0.0, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.03267712428803131, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80145574, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 1126, + "time_per_iteration": 4.8121678829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_mlp": 1.13257909, + "diversity_loss_mlp": 0.0, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.09699177011816186, + "language_loss": 0.85244691, + "learning_rate": 0.0009112467358650396, + "loss": 0.86393118, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1127, + "time_per_iteration": 3.144075393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_mlp": 1.15056634, + "diversity_loss_mlp": 0.0, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.07985175184807933, + "language_loss": 0.86319685, + "learning_rate": 0.0009110694587092192, + "loss": 0.87486213, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.1595459, + "routerloss_mlp": 0.0, + "step": 1128, + "time_per_iteration": 2.7497644424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179675, + "balance_loss_mlp": 1.1634866, + "diversity_loss_mlp": 0.0, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.1038215552752292, + "language_loss": 0.81267089, + "learning_rate": 0.0009108920219620815, + "loss": 0.82446766, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 1129, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195026, + "balance_loss_mlp": 1.1788609, + "diversity_loss_mlp": 0.0, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06771714561059723, + "language_loss": 0.89286679, + "learning_rate": 0.0009107144256925133, + "loss": 0.9048171, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1130, + "time_per_iteration": 2.6569926738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196317, + "balance_loss_mlp": 1.18006873, + "diversity_loss_mlp": 0.0, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08333124164895586, + "language_loss": 0.82520813, + "learning_rate": 0.0009105366699694638, + "loss": 0.83717132, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 1131, + "time_per_iteration": 2.7384698390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200769, + "balance_loss_mlp": 1.18390059, + "diversity_loss_mlp": 0.0, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.07018840625680964, + "language_loss": 0.81826723, + "learning_rate": 0.0009103587548619439, + "loss": 0.83027488, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 1132, + "time_per_iteration": 2.8361291885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188026, + "balance_loss_mlp": 1.17064476, + "diversity_loss_mlp": 0.0, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.08238158624987729, + "language_loss": 0.85952497, + "learning_rate": 0.0009101806804390261, + "loss": 0.87140524, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.1739502, + "routerloss_mlp": 0.0, + "step": 1133, + "time_per_iteration": 2.8646528720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846565, + "balance_loss_mlp": 1.45559311, + "diversity_loss_mlp": 0.20202307, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.03511986753794681, + "language_loss": 0.90682399, + "learning_rate": 0.0009100024467698453, + "loss": 0.91528964, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01775702, + "step": 1134, + "time_per_iteration": 2.628955364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119036, + "balance_loss_mlp": 1.17289567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.09831196896097749, + "language_loss": 0.82889581, + "learning_rate": 0.0009098240539235981, + "loss": 0.84079945, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 1135, + "time_per_iteration": 2.6857638359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_mlp": 1.16191649, + "diversity_loss_mlp": 0.0, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07855046788509763, + "language_loss": 0.87649047, + "learning_rate": 0.0009096455019695423, + "loss": 0.88828909, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 1136, + "time_per_iteration": 2.814746856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_mlp": 1.15702188, + "diversity_loss_mlp": 0.0, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.090535881946018, + "language_loss": 0.89789271, + "learning_rate": 0.000909466790976998, + "loss": 0.90964472, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18188477, + "routerloss_mlp": 0.0, + "step": 1137, + "time_per_iteration": 2.503934144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13231349, + "diversity_loss_mlp": 0.0, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07386356915969775, + "language_loss": 0.82546908, + "learning_rate": 0.0009092879210153473, + "loss": 0.83698207, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18981934, + "routerloss_mlp": 0.0, + "step": 1138, + "time_per_iteration": 3.106015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143167, + "balance_loss_mlp": 1.12445128, + "diversity_loss_mlp": 0.0, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08443059177839436, + "language_loss": 0.89126158, + "learning_rate": 0.0009091088921540333, + "loss": 0.90269327, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.18701172, + "routerloss_mlp": 0.0, + "step": 1139, + "time_per_iteration": 2.5165584087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197317, + "balance_loss_mlp": 1.18491888, + "diversity_loss_mlp": 0.0, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.06938907882855633, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76705992, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 1140, + "time_per_iteration": 4.907839775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845315, + "balance_loss_mlp": 1.45913088, + "diversity_loss_mlp": 0.19676474, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.04157801253712285, + "language_loss": 0.84799111, + "learning_rate": 0.0009087503580104985, + "loss": 0.8564443, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01736734, + "step": 1141, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106137, + "balance_loss_mlp": 1.08643126, + "diversity_loss_mlp": 0.0, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09652849342648293, + "language_loss": 0.7964108, + "learning_rate": 0.0009085708528674728, + "loss": 0.80747211, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 1142, + "time_per_iteration": 2.7800490856170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.09476519, + "diversity_loss_mlp": 0.0, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.11345906914127299, + "language_loss": 0.8700006, + "learning_rate": 0.0009083911891031745, + "loss": 0.88115132, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 1143, + "time_per_iteration": 3.104893684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_mlp": 1.08533978, + "diversity_loss_mlp": 0.0, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.12428556161586228, + "language_loss": 0.91569418, + "learning_rate": 0.0009082113667873553, + "loss": 0.92673439, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18676758, + "routerloss_mlp": 0.0, + "step": 1144, + "time_per_iteration": 3.0838277339935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_mlp": 1.12060392, + "diversity_loss_mlp": 0.0, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.0955721440223133, + "language_loss": 0.90911627, + "learning_rate": 0.0009080313859898283, + "loss": 0.92050546, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 1145, + "time_per_iteration": 2.4998109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.14463091, + "diversity_loss_mlp": 0.0, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07871728913387968, + "language_loss": 0.91642439, + "learning_rate": 0.0009078512467804684, + "loss": 0.92804986, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.17932129, + "routerloss_mlp": 0.0, + "step": 1146, + "time_per_iteration": 2.583137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192448, + "balance_loss_mlp": 1.17516243, + "diversity_loss_mlp": 0.0, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.10815580627735921, + "language_loss": 0.90245295, + "learning_rate": 0.0009076709492292119, + "loss": 0.91437739, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 1147, + "time_per_iteration": 2.6189510822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199389, + "balance_loss_mlp": 1.18260384, + "diversity_loss_mlp": 0.0, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.10018226205073696, + "language_loss": 0.88948917, + "learning_rate": 0.0009074904934060562, + "loss": 0.90148306, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1148, + "time_per_iteration": 2.6619913578033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.17623389, + "diversity_loss_mlp": 0.0, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.09879445691718633, + "language_loss": 0.85041308, + "learning_rate": 0.0009073098793810607, + "loss": 0.8623414, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.1661377, + "routerloss_mlp": 0.0, + "step": 1149, + "time_per_iteration": 2.9382119178771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185083, + "balance_loss_mlp": 1.16848898, + "diversity_loss_mlp": 0.0, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.09716543961816822, + "language_loss": 0.88557786, + "learning_rate": 0.000907129107224346, + "loss": 0.89742863, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.16601562, + "routerloss_mlp": 0.0, + "step": 1150, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.17356002, + "diversity_loss_mlp": 0.0, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.0741661773141201, + "language_loss": 0.88313866, + "learning_rate": 0.0009069481770060939, + "loss": 0.89504004, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1151, + "time_per_iteration": 2.676938056945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118655, + "balance_loss_mlp": 1.17039752, + "diversity_loss_mlp": 0.0, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06827936796637825, + "language_loss": 0.83848286, + "learning_rate": 0.000906767088796548, + "loss": 0.85034835, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1152, + "time_per_iteration": 3.442782163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185002, + "balance_loss_mlp": 1.16889715, + "diversity_loss_mlp": 0.0, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.07358747282835834, + "language_loss": 0.87001419, + "learning_rate": 0.0009065858426660127, + "loss": 0.88186425, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1153, + "time_per_iteration": 2.6501753330230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178927, + "balance_loss_mlp": 1.16286922, + "diversity_loss_mlp": 0.0, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.0863709920952229, + "language_loss": 0.84764236, + "learning_rate": 0.0009064044386848543, + "loss": 0.85943162, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1154, + "time_per_iteration": 2.920689344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176891, + "balance_loss_mlp": 1.16032064, + "diversity_loss_mlp": 0.0, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.07669791788600007, + "language_loss": 0.88829726, + "learning_rate": 0.0009062228769234997, + "loss": 0.90006614, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 1155, + "time_per_iteration": 2.561638832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154629, + "balance_loss_mlp": 1.13797593, + "diversity_loss_mlp": 0.0, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08447027490527963, + "language_loss": 0.81123281, + "learning_rate": 0.0009060411574524376, + "loss": 0.82277906, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 1156, + "time_per_iteration": 2.655132293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162354, + "balance_loss_mlp": 1.14597416, + "diversity_loss_mlp": 0.0, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.08665349089557017, + "language_loss": 0.87817705, + "learning_rate": 0.0009058592803422178, + "loss": 0.88980061, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 1157, + "time_per_iteration": 3.1417362689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_mlp": 1.17430186, + "diversity_loss_mlp": 0.0, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.06198684812147071, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79893315, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1158, + "time_per_iteration": 4.867843866348267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.1120069, + "diversity_loss_mlp": 0.0, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0864152607347894, + "language_loss": 0.90156865, + "learning_rate": 0.00090549505348681, + "loss": 0.91285539, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1159, + "time_per_iteration": 2.581865072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118982, + "balance_loss_mlp": 1.1025548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07056827667929483, + "language_loss": 0.83819324, + "learning_rate": 0.0009053127038830275, + "loss": 0.84938306, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 1160, + "time_per_iteration": 2.9969708919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881169, + "balance_loss_mlp": 1.53314447, + "diversity_loss_mlp": 0.19063006, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.04002382495760162, + "language_loss": 0.87460124, + "learning_rate": 0.000905130196922898, + "loss": 0.88341296, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01928164, + "step": 1161, + "time_per_iteration": 2.6307718753814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881407, + "balance_loss_mlp": 1.5316093, + "diversity_loss_mlp": 0.19140732, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.030280826501304762, + "language_loss": 0.86784196, + "learning_rate": 0.0009049475326772769, + "loss": 0.87665606, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01989887, + "step": 1162, + "time_per_iteration": 2.6021478176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00889034, + "balance_loss_mlp": 1.54766631, + "diversity_loss_mlp": 0.19066738, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.03198536270345376, + "language_loss": 0.83124602, + "learning_rate": 0.0009047647112170811, + "loss": 0.84013629, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01986698, + "step": 1163, + "time_per_iteration": 2.804150342941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123868, + "balance_loss_mlp": 1.1070838, + "diversity_loss_mlp": 0.0, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.09901141435665076, + "language_loss": 0.87948084, + "learning_rate": 0.0009045817326132876, + "loss": 0.89071947, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1164, + "time_per_iteration": 3.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125369, + "balance_loss_mlp": 1.107988, + "diversity_loss_mlp": 0.0, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08432013167879508, + "language_loss": 0.83142793, + "learning_rate": 0.0009043985969369357, + "loss": 0.84268159, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17407227, + "routerloss_mlp": 0.0, + "step": 1165, + "time_per_iteration": 2.8148193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146301, + "balance_loss_mlp": 1.12976706, + "diversity_loss_mlp": 0.0, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06944445596490195, + "language_loss": 0.84334069, + "learning_rate": 0.0009042153042591245, + "loss": 0.85480368, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 1166, + "time_per_iteration": 2.8004493713378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142176, + "balance_loss_mlp": 1.12542677, + "diversity_loss_mlp": 0.0, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.06821660135571728, + "language_loss": 0.85225487, + "learning_rate": 0.0009040318546510146, + "loss": 0.86367661, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 1167, + "time_per_iteration": 3.1969215869903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156354, + "balance_loss_mlp": 1.13979554, + "diversity_loss_mlp": 0.0, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.06547364647617461, + "language_loss": 0.84988701, + "learning_rate": 0.0009038482481838275, + "loss": 0.86145055, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 1168, + "time_per_iteration": 2.7087180614471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00861334, + "balance_loss_mlp": 1.49333596, + "diversity_loss_mlp": 0.19261675, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.02892951533663535, + "language_loss": 0.87266529, + "learning_rate": 0.0009036644849288455, + "loss": 0.88127863, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01835741, + "step": 1169, + "time_per_iteration": 3.1039352416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.1631248, + "diversity_loss_mlp": 0.0, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.06865085555084699, + "language_loss": 0.85404736, + "learning_rate": 0.0009034805649574118, + "loss": 0.86584634, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.16784668, + "routerloss_mlp": 0.0, + "step": 1170, + "time_per_iteration": 2.659322738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208955, + "balance_loss_mlp": 1.1926589, + "diversity_loss_mlp": 0.0, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.07685307661183591, + "language_loss": 0.85691977, + "learning_rate": 0.0009032964883409308, + "loss": 0.86900926, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 1171, + "time_per_iteration": 2.8938751220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128211, + "balance_loss_mlp": 1.11910319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.06058864885284362, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74178743, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 1172, + "time_per_iteration": 4.983820676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217918, + "balance_loss_mlp": 1.20207548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.1048847225020503, + "language_loss": 0.8717351, + "learning_rate": 0.0009029278654587462, + "loss": 0.88391435, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.1583252, + "routerloss_mlp": 0.0, + "step": 1173, + "time_per_iteration": 2.639632225036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_mlp": 1.16508245, + "diversity_loss_mlp": 0.0, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.07111002228073603, + "language_loss": 0.82226282, + "learning_rate": 0.0009027433193361548, + "loss": 0.83407944, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1174, + "time_per_iteration": 2.7443323135375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159983, + "balance_loss_mlp": 1.14366364, + "diversity_loss_mlp": 0.0, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06531304020653, + "language_loss": 0.86980343, + "learning_rate": 0.00090255861685474, + "loss": 0.88140327, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 1175, + "time_per_iteration": 2.7534220218658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142116, + "balance_loss_mlp": 1.12533128, + "diversity_loss_mlp": 0.0, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.10016618462748716, + "language_loss": 0.90750074, + "learning_rate": 0.0009023737580862095, + "loss": 0.91892195, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1176, + "time_per_iteration": 2.5116937160491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_mlp": 1.12470055, + "diversity_loss_mlp": 0.0, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0707285441494173, + "language_loss": 0.83225566, + "learning_rate": 0.0009021887431023321, + "loss": 0.84366333, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1177, + "time_per_iteration": 2.599956512451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130034, + "balance_loss_mlp": 1.11444104, + "diversity_loss_mlp": 0.0, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08431891612549362, + "language_loss": 0.87212515, + "learning_rate": 0.0009020035719749369, + "loss": 0.88342547, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1178, + "time_per_iteration": 2.7144312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135701, + "balance_loss_mlp": 1.1205014, + "diversity_loss_mlp": 0.0, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.09883499682369536, + "language_loss": 0.77450085, + "learning_rate": 0.0009018182447759136, + "loss": 0.7858578, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1179, + "time_per_iteration": 2.98848557472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137145, + "balance_loss_mlp": 1.12187457, + "diversity_loss_mlp": 0.0, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.08173095074239418, + "language_loss": 0.79878223, + "learning_rate": 0.0009016327615772126, + "loss": 0.81015366, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1180, + "time_per_iteration": 2.9338154792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149275, + "balance_loss_mlp": 1.13449335, + "diversity_loss_mlp": 0.0, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.08374692364956231, + "language_loss": 0.87680298, + "learning_rate": 0.0009014471224508451, + "loss": 0.88829577, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1181, + "time_per_iteration": 2.7131431102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881934, + "balance_loss_mlp": 1.53494334, + "diversity_loss_mlp": 0.19571492, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.04185105584005936, + "language_loss": 0.83154267, + "learning_rate": 0.0009012613274688823, + "loss": 0.84036207, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01660516, + "step": 1182, + "time_per_iteration": 2.649559736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184244, + "balance_loss_mlp": 1.1692239, + "diversity_loss_mlp": 0.0, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.12019924395271459, + "language_loss": 0.87753081, + "learning_rate": 0.0009010753767034565, + "loss": 0.8893733, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1183, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175003, + "balance_loss_mlp": 1.16030502, + "diversity_loss_mlp": 0.0, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.08783280174490297, + "language_loss": 0.78918862, + "learning_rate": 0.0009008892702267599, + "loss": 0.80093861, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1184, + "time_per_iteration": 2.9962406158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139115, + "balance_loss_mlp": 1.12460732, + "diversity_loss_mlp": 0.0, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.08254121322216867, + "language_loss": 0.88525105, + "learning_rate": 0.0009007030081110457, + "loss": 0.89664215, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1185, + "time_per_iteration": 2.5990660190582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_mlp": 1.11087465, + "diversity_loss_mlp": 0.0, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.07610459395316062, + "language_loss": 0.84548527, + "learning_rate": 0.000900516590428627, + "loss": 0.85674113, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1186, + "time_per_iteration": 2.7377407550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121666, + "balance_loss_mlp": 1.1070751, + "diversity_loss_mlp": 0.0, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.13748029932532174, + "language_loss": 0.89182103, + "learning_rate": 0.0009003300172518778, + "loss": 0.90303767, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1187, + "time_per_iteration": 2.6916556358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_mlp": 1.10145736, + "diversity_loss_mlp": 0.0, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.11313229810108143, + "language_loss": 0.84335989, + "learning_rate": 0.0009001432886532321, + "loss": 0.85452211, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1188, + "time_per_iteration": 2.9698264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.06729358528862889, + "language_loss": 0.86774516, + "learning_rate": 0.0008999564047051843, + "loss": 0.87889242, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1189, + "time_per_iteration": 2.5002098083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_mlp": 1.12243462, + "diversity_loss_mlp": 0.0, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0714274855120672, + "language_loss": 0.84824312, + "learning_rate": 0.0008997693654802894, + "loss": 0.85961115, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1190, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149047, + "balance_loss_mlp": 1.13425303, + "diversity_loss_mlp": 0.0, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.07754985979781381, + "language_loss": 0.86714745, + "learning_rate": 0.0008995821710511625, + "loss": 0.87863791, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1191, + "time_per_iteration": 2.7126989364624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14807296, + "diversity_loss_mlp": 0.0, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.11547698788472376, + "language_loss": 0.85060751, + "learning_rate": 0.0008993948214904786, + "loss": 0.86223602, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1192, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152534, + "balance_loss_mlp": 1.14361739, + "diversity_loss_mlp": 0.0, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.05307726892258072, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79574746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 1193, + "time_per_iteration": 4.909748792648315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187526, + "balance_loss_mlp": 1.17205215, + "diversity_loss_mlp": 0.0, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.09739164860103838, + "language_loss": 0.78353333, + "learning_rate": 0.0008990196572654427, + "loss": 0.79540861, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.15454102, + "routerloss_mlp": 0.0, + "step": 1194, + "time_per_iteration": 2.8592262268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117424, + "balance_loss_mlp": 1.1592319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.06260411033315277, + "language_loss": 0.87559408, + "learning_rate": 0.0008988318427467426, + "loss": 0.88733649, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1195, + "time_per_iteration": 2.7444722652435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00878316, + "balance_loss_mlp": 1.52780199, + "diversity_loss_mlp": 0.1948241, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.0364111048645648, + "language_loss": 0.86376345, + "learning_rate": 0.0008986438733877887, + "loss": 0.87254667, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01700337, + "step": 1196, + "time_per_iteration": 3.5090088844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137546, + "balance_loss_mlp": 1.1229074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.08413871186116019, + "language_loss": 0.83810687, + "learning_rate": 0.0008984557492615576, + "loss": 0.84948236, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1197, + "time_per_iteration": 2.9953744411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10803354, + "diversity_loss_mlp": 0.0, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.08617240411661099, + "language_loss": 0.90267789, + "learning_rate": 0.0008982674704410854, + "loss": 0.91390687, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1198, + "time_per_iteration": 2.7513339519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110338, + "balance_loss_mlp": 1.09598517, + "diversity_loss_mlp": 0.0, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.11146547076727734, + "language_loss": 0.77876621, + "learning_rate": 0.0008980790369994682, + "loss": 0.78986955, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1199, + "time_per_iteration": 2.989825487136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120977, + "balance_loss_mlp": 1.10670781, + "diversity_loss_mlp": 0.0, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.0677628031660983, + "language_loss": 0.8729977, + "learning_rate": 0.000897890449009863, + "loss": 0.88420743, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1200, + "time_per_iteration": 2.6784448623657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_mlp": 1.11330509, + "diversity_loss_mlp": 0.0, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.080414080555838, + "language_loss": 0.89825618, + "learning_rate": 0.0008977017065454853, + "loss": 0.90953267, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1201, + "time_per_iteration": 2.6610703468322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880483, + "balance_loss_mlp": 1.52539706, + "diversity_loss_mlp": 0.19880572, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.03277795962214655, + "language_loss": 0.80367738, + "learning_rate": 0.0008975128096796121, + "loss": 0.81248224, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01838172, + "step": 1202, + "time_per_iteration": 2.901998996734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145011, + "balance_loss_mlp": 1.13089633, + "diversity_loss_mlp": 0.0, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.10693947298766643, + "language_loss": 0.85848922, + "learning_rate": 0.0008973237584855794, + "loss": 0.86993933, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1203, + "time_per_iteration": 2.872408151626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160017, + "balance_loss_mlp": 1.1457237, + "diversity_loss_mlp": 0.0, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08753213296005687, + "language_loss": 0.82586002, + "learning_rate": 0.0008971345530367832, + "loss": 0.83746028, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1204, + "time_per_iteration": 2.4641921520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185717, + "balance_loss_mlp": 1.17120886, + "diversity_loss_mlp": 0.0, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07947534631123947, + "language_loss": 0.85658818, + "learning_rate": 0.0008969451934066799, + "loss": 0.8684454, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1205, + "time_per_iteration": 2.7822117805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173591, + "balance_loss_mlp": 1.15872586, + "diversity_loss_mlp": 0.0, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08780432716538046, + "language_loss": 0.79991889, + "learning_rate": 0.0008967556796687854, + "loss": 0.81165481, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1206, + "time_per_iteration": 2.8849406242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117013, + "balance_loss_mlp": 1.15584886, + "diversity_loss_mlp": 0.0, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.07569633120476413, + "language_loss": 0.83779937, + "learning_rate": 0.0008965660118966752, + "loss": 0.84950066, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1207, + "time_per_iteration": 2.9316329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146692, + "balance_loss_mlp": 1.1319102, + "diversity_loss_mlp": 0.0, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06968265941642382, + "language_loss": 0.90114093, + "learning_rate": 0.0008963761901639851, + "loss": 0.91260791, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1208, + "time_per_iteration": 2.8140323162078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113879, + "balance_loss_mlp": 1.12392485, + "diversity_loss_mlp": 0.0, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.08612535310277082, + "language_loss": 0.83098078, + "learning_rate": 0.0008961862145444103, + "loss": 0.84236872, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1209, + "time_per_iteration": 2.7529945373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_mlp": 1.10796285, + "diversity_loss_mlp": 0.0, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08243119711445285, + "language_loss": 0.85338795, + "learning_rate": 0.0008959960851117059, + "loss": 0.86461735, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1210, + "time_per_iteration": 2.624340534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108554, + "balance_loss_mlp": 1.09396267, + "diversity_loss_mlp": 0.0, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.10596241027535934, + "language_loss": 0.84048676, + "learning_rate": 0.0008958058019396868, + "loss": 0.85157233, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1211, + "time_per_iteration": 2.8316566944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112068, + "balance_loss_mlp": 1.09751284, + "diversity_loss_mlp": 0.0, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.07651667178885936, + "language_loss": 0.86494702, + "learning_rate": 0.0008956153651022274, + "loss": 0.8760677, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1212, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_mlp": 1.08926892, + "diversity_loss_mlp": 0.0, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07459915787800217, + "language_loss": 0.83929688, + "learning_rate": 0.0008954247746732618, + "loss": 0.85033321, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1213, + "time_per_iteration": 2.6184399127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117524, + "balance_loss_mlp": 1.10321903, + "diversity_loss_mlp": 0.0, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.08317009769115577, + "language_loss": 0.90604293, + "learning_rate": 0.0008952340307267837, + "loss": 0.91721821, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1214, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_mlp": 1.10553002, + "diversity_loss_mlp": 0.0, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.09601716623847659, + "language_loss": 0.83731341, + "learning_rate": 0.0008950431333368468, + "loss": 0.84850979, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1215, + "time_per_iteration": 2.6151199340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130858, + "balance_loss_mlp": 1.11676729, + "diversity_loss_mlp": 0.0, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.08049188450288745, + "language_loss": 0.84623635, + "learning_rate": 0.0008948520825775634, + "loss": 0.8575449, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1216, + "time_per_iteration": 3.645200490951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123063, + "balance_loss_mlp": 1.10880601, + "diversity_loss_mlp": 0.0, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.08038238822992319, + "language_loss": 0.83978343, + "learning_rate": 0.0008946608785231067, + "loss": 0.85101402, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1217, + "time_per_iteration": 2.871616840362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126329, + "balance_loss_mlp": 1.11263156, + "diversity_loss_mlp": 0.0, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07832391647543825, + "language_loss": 0.84442961, + "learning_rate": 0.0008944695212477084, + "loss": 0.85569292, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1218, + "time_per_iteration": 2.507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123493, + "balance_loss_mlp": 1.10867572, + "diversity_loss_mlp": 0.0, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.07420792055611987, + "language_loss": 0.86334574, + "learning_rate": 0.0008942780108256599, + "loss": 0.87458062, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1219, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.07657909053901747, + "language_loss": 0.86160946, + "learning_rate": 0.0008940863473313121, + "loss": 0.87268722, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1220, + "time_per_iteration": 2.495164632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_mlp": 1.09272623, + "diversity_loss_mlp": 0.0, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07962638616920462, + "language_loss": 0.87889743, + "learning_rate": 0.0008938945308390756, + "loss": 0.88997114, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1221, + "time_per_iteration": 2.613927125930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097552, + "balance_loss_mlp": 1.08298469, + "diversity_loss_mlp": 0.0, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.06679649396710063, + "language_loss": 0.87179595, + "learning_rate": 0.00089370256142342, + "loss": 0.88277149, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1222, + "time_per_iteration": 2.732208013534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094745, + "balance_loss_mlp": 1.07952189, + "diversity_loss_mlp": 0.0, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.06680688140454344, + "language_loss": 0.84810197, + "learning_rate": 0.0008935104391588746, + "loss": 0.85904944, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1223, + "time_per_iteration": 2.7585461139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.07917881, + "diversity_loss_mlp": 0.0, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.07271030004651308, + "language_loss": 0.83111542, + "learning_rate": 0.0008933181641200276, + "loss": 0.84206444, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.15710449, + "routerloss_mlp": 0.0, + "step": 1224, + "time_per_iteration": 3.1440725326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087445, + "balance_loss_mlp": 1.07139981, + "diversity_loss_mlp": 0.0, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.07882513603721358, + "language_loss": 0.85824931, + "learning_rate": 0.0008931257363815271, + "loss": 0.8691237, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.16040039, + "routerloss_mlp": 0.0, + "step": 1225, + "time_per_iteration": 2.8887243270874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092595, + "balance_loss_mlp": 1.07659674, + "diversity_loss_mlp": 0.0, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.09571789824401095, + "language_loss": 0.89901638, + "learning_rate": 0.0008929331560180798, + "loss": 0.90994227, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.15991211, + "routerloss_mlp": 0.0, + "step": 1226, + "time_per_iteration": 2.897155284881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095366, + "balance_loss_mlp": 1.07965469, + "diversity_loss_mlp": 0.0, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.068724406385502, + "language_loss": 0.90771782, + "learning_rate": 0.0008927404231044525, + "loss": 0.91867149, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 1227, + "time_per_iteration": 2.6892144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.08764625, + "diversity_loss_mlp": 0.0, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.06943954848997126, + "language_loss": 0.81646705, + "learning_rate": 0.0008925475377154703, + "loss": 0.82749879, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1228, + "time_per_iteration": 2.727325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.11394727, + "diversity_loss_mlp": 0.0, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.0778889683705481, + "language_loss": 0.8212285, + "learning_rate": 0.0008923544999260183, + "loss": 0.83252132, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1229, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.13194346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.0853653064859127, + "language_loss": 0.91254115, + "learning_rate": 0.00089216130981104, + "loss": 0.92400861, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1230, + "time_per_iteration": 3.016228199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138894, + "balance_loss_mlp": 1.12364721, + "diversity_loss_mlp": 0.0, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.08048994442870243, + "language_loss": 0.82752085, + "learning_rate": 0.000891967967445539, + "loss": 0.83890975, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.15222168, + "routerloss_mlp": 0.0, + "step": 1231, + "time_per_iteration": 2.65736722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126061, + "balance_loss_mlp": 1.11135054, + "diversity_loss_mlp": 0.0, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.05909715635047166, + "language_loss": 0.889099, + "learning_rate": 0.0008917744729045772, + "loss": 0.90035963, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1232, + "time_per_iteration": 2.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110871, + "balance_loss_mlp": 1.0962795, + "diversity_loss_mlp": 0.0, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08046733758331526, + "language_loss": 0.83836448, + "learning_rate": 0.0008915808262632757, + "loss": 0.84947324, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1233, + "time_per_iteration": 2.860353708267212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918962, + "balance_loss_mlp": 1.60287488, + "diversity_loss_mlp": 0.20008399, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.03182006079144566, + "language_loss": 0.93544835, + "learning_rate": 0.0008913870275968148, + "loss": 0.94463801, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017482, + "step": 1234, + "time_per_iteration": 2.7328829765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095751, + "balance_loss_mlp": 1.08008718, + "diversity_loss_mlp": 0.0, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.07195832826776788, + "language_loss": 0.87503707, + "learning_rate": 0.0008911930769804342, + "loss": 0.88599461, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1235, + "time_per_iteration": 3.2619638442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091405, + "balance_loss_mlp": 1.07551408, + "diversity_loss_mlp": 0.0, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.07148547933088874, + "language_loss": 0.91313815, + "learning_rate": 0.0008909989744894318, + "loss": 0.92405218, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1236, + "time_per_iteration": 2.8687992095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06530952, + "diversity_loss_mlp": 0.0, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.08021447901266163, + "language_loss": 0.81662518, + "learning_rate": 0.0008908047201991649, + "loss": 0.8274349, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1237, + "time_per_iteration": 2.737638235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076608, + "balance_loss_mlp": 1.06138515, + "diversity_loss_mlp": 0.0, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.07749899394714953, + "language_loss": 0.86585152, + "learning_rate": 0.0008906103141850502, + "loss": 0.87661767, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.15197754, + "routerloss_mlp": 0.0, + "step": 1238, + "time_per_iteration": 2.9184746742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068848, + "balance_loss_mlp": 1.05385113, + "diversity_loss_mlp": 0.0, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.10230617436374452, + "language_loss": 0.88104367, + "learning_rate": 0.0008904157565225621, + "loss": 0.89173216, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.1496582, + "routerloss_mlp": 0.0, + "step": 1239, + "time_per_iteration": 2.6396749019622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_mlp": 1.06220865, + "diversity_loss_mlp": 0.0, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.10467557893696883, + "language_loss": 0.81824136, + "learning_rate": 0.000890221047287235, + "loss": 0.82901168, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1240, + "time_per_iteration": 3.496812582015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081371, + "balance_loss_mlp": 1.06710172, + "diversity_loss_mlp": 0.0, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.09443583580909311, + "language_loss": 0.91125917, + "learning_rate": 0.0008900261865546615, + "loss": 0.92207289, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1241, + "time_per_iteration": 2.6527724266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_mlp": 1.0890398, + "diversity_loss_mlp": 0.0, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.08429957072104315, + "language_loss": 0.84985352, + "learning_rate": 0.0008898311744004936, + "loss": 0.86089325, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1242, + "time_per_iteration": 2.6740338802337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118763, + "balance_loss_mlp": 1.10411179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.07332762129893158, + "language_loss": 0.86932802, + "learning_rate": 0.0008896360109004414, + "loss": 0.88051569, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1243, + "time_per_iteration": 2.643489122390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12715125, + "diversity_loss_mlp": 0.0, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09306092844590973, + "language_loss": 0.84636557, + "learning_rate": 0.0008894406961302742, + "loss": 0.85778666, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1244, + "time_per_iteration": 2.5876173973083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150798, + "balance_loss_mlp": 1.13590896, + "diversity_loss_mlp": 0.0, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.0838589606869783, + "language_loss": 0.83944738, + "learning_rate": 0.0008892452301658201, + "loss": 0.85095537, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1245, + "time_per_iteration": 2.928391218185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116146, + "balance_loss_mlp": 1.1460346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.0736247551351698, + "language_loss": 0.83299339, + "learning_rate": 0.0008890496130829653, + "loss": 0.84460801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1246, + "time_per_iteration": 2.6510462760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915571, + "balance_loss_mlp": 1.59993446, + "diversity_loss_mlp": 0.1987851, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.03287481157446996, + "language_loss": 0.85918486, + "learning_rate": 0.0008888538449576555, + "loss": 0.86834061, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621127, + "step": 1247, + "time_per_iteration": 2.5719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178279, + "balance_loss_mlp": 1.16323447, + "diversity_loss_mlp": 0.0, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10811715250715398, + "language_loss": 0.83036304, + "learning_rate": 0.0008886579258658944, + "loss": 0.8421458, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.15014648, + "routerloss_mlp": 0.0, + "step": 1248, + "time_per_iteration": 2.5736701488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148631, + "balance_loss_mlp": 1.13341999, + "diversity_loss_mlp": 0.0, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.07868761607649298, + "language_loss": 0.84717274, + "learning_rate": 0.0008884618558837446, + "loss": 0.85865903, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1249, + "time_per_iteration": 2.8215761184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911764, + "balance_loss_mlp": 1.59372783, + "diversity_loss_mlp": 0.19720009, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.03236174678929329, + "language_loss": 0.8677094, + "learning_rate": 0.0008882656350873273, + "loss": 0.87682706, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01629994, + "step": 1250, + "time_per_iteration": 2.885092258453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126022, + "balance_loss_mlp": 1.11122799, + "diversity_loss_mlp": 0.0, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.08347743908005935, + "language_loss": 0.87000573, + "learning_rate": 0.0008880692635528219, + "loss": 0.88126594, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1251, + "time_per_iteration": 3.049070119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106629, + "balance_loss_mlp": 1.09177542, + "diversity_loss_mlp": 0.0, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.07406446185181008, + "language_loss": 0.89514965, + "learning_rate": 0.0008878727413564669, + "loss": 0.90621597, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1252, + "time_per_iteration": 2.734839677810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.06804204, + "diversity_loss_mlp": 0.0, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.048930323133030355, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81211317, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1253, + "time_per_iteration": 4.854974031448364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00873083, + "balance_loss_mlp": 1.51531768, + "diversity_loss_mlp": 0.19563958, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.03648198852202315, + "language_loss": 0.78763413, + "learning_rate": 0.0008874792452834528, + "loss": 0.7963649, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01760404, + "step": 1254, + "time_per_iteration": 2.803690195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_mlp": 1.07530415, + "diversity_loss_mlp": 0.0, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.09659900556863026, + "language_loss": 0.8729195, + "learning_rate": 0.0008872822715595626, + "loss": 0.88381982, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1255, + "time_per_iteration": 2.657867670059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_mlp": 1.06968451, + "diversity_loss_mlp": 0.0, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10497791491954662, + "language_loss": 0.87333822, + "learning_rate": 0.0008870851474793598, + "loss": 0.88418031, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1256, + "time_per_iteration": 2.5694568157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_mlp": 1.06920075, + "diversity_loss_mlp": 0.0, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.07331256259210016, + "language_loss": 0.89243567, + "learning_rate": 0.0008868878731193752, + "loss": 0.90327322, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1257, + "time_per_iteration": 2.829789400100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086138, + "balance_loss_mlp": 1.07158267, + "diversity_loss_mlp": 0.0, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.07236027639177293, + "language_loss": 0.89720446, + "learning_rate": 0.0008866904485561973, + "loss": 0.90806586, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1258, + "time_per_iteration": 2.731635570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078524, + "balance_loss_mlp": 1.06384969, + "diversity_loss_mlp": 0.0, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.0727569881861308, + "language_loss": 0.83084273, + "learning_rate": 0.000886492873866473, + "loss": 0.84162796, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1259, + "time_per_iteration": 2.8250575065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_mlp": 1.06528533, + "diversity_loss_mlp": 0.0, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.10762424055834904, + "language_loss": 0.84672934, + "learning_rate": 0.000886295149126908, + "loss": 0.85753107, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1260, + "time_per_iteration": 2.7148356437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086434, + "balance_loss_mlp": 1.07181931, + "diversity_loss_mlp": 0.0, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.07159531524201106, + "language_loss": 0.85693741, + "learning_rate": 0.0008860972744142655, + "loss": 0.86780179, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1261, + "time_per_iteration": 2.931696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115009, + "balance_loss_mlp": 1.10064411, + "diversity_loss_mlp": 0.0, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.065367920687613, + "language_loss": 0.81639904, + "learning_rate": 0.0008858992498053671, + "loss": 0.82754916, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1262, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_mlp": 1.04764521, + "diversity_loss_mlp": 0.0, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.03374572714932058, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77644455, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1263, + "time_per_iteration": 4.882519006729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872344, + "balance_loss_mlp": 1.51226497, + "diversity_loss_mlp": 0.19974959, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.03166105856965055, + "language_loss": 0.83409035, + "learning_rate": 0.0008855027512063817, + "loss": 0.84281385, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633644, + "step": 1264, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185798, + "balance_loss_mlp": 1.17132628, + "diversity_loss_mlp": 0.0, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.06261248257395001, + "language_loss": 0.85949916, + "learning_rate": 0.0008853042773702292, + "loss": 0.8713572, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1265, + "time_per_iteration": 2.695514440536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196886, + "balance_loss_mlp": 1.18234205, + "diversity_loss_mlp": 0.0, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.08760826562773598, + "language_loss": 0.87981403, + "learning_rate": 0.0008851056539456896, + "loss": 0.89178288, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1266, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119913, + "balance_loss_mlp": 1.18489647, + "diversity_loss_mlp": 0.0, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.07991839198753149, + "language_loss": 0.81904382, + "learning_rate": 0.0008849068810098755, + "loss": 0.83103514, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1267, + "time_per_iteration": 3.3067915439605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174372, + "balance_loss_mlp": 1.15992332, + "diversity_loss_mlp": 0.0, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.10499473220259715, + "language_loss": 0.83550054, + "learning_rate": 0.0008847079586399575, + "loss": 0.84724426, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1268, + "time_per_iteration": 2.4791157245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.13699341, + "diversity_loss_mlp": 0.0, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07765469411987547, + "language_loss": 0.86144567, + "learning_rate": 0.0008845088869131641, + "loss": 0.87295628, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1269, + "time_per_iteration": 2.6733555793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_mlp": 1.10053682, + "diversity_loss_mlp": 0.0, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.0888033537849515, + "language_loss": 0.88898385, + "learning_rate": 0.0008843096659067818, + "loss": 0.90013218, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1270, + "time_per_iteration": 2.6315910816192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111213, + "balance_loss_mlp": 1.09708679, + "diversity_loss_mlp": 0.0, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.09475560383246978, + "language_loss": 0.86565858, + "learning_rate": 0.000884110295698155, + "loss": 0.87677073, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1271, + "time_per_iteration": 2.926668643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.08752966, + "diversity_loss_mlp": 0.0, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.09917556522455147, + "language_loss": 0.85849231, + "learning_rate": 0.0008839107763646861, + "loss": 0.86951411, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1272, + "time_per_iteration": 2.58022403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_mlp": 1.08751881, + "diversity_loss_mlp": 0.0, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.08783320449451974, + "language_loss": 0.89941388, + "learning_rate": 0.0008837111079838353, + "loss": 0.91043806, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1273, + "time_per_iteration": 2.6877150535583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111566, + "balance_loss_mlp": 1.10096157, + "diversity_loss_mlp": 0.0, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.07640958054403056, + "language_loss": 0.89671296, + "learning_rate": 0.000883511290633121, + "loss": 0.90786958, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1274, + "time_per_iteration": 2.5929813385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123812, + "balance_loss_mlp": 1.10898256, + "diversity_loss_mlp": 0.0, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.05814589763763208, + "language_loss": 0.92211604, + "learning_rate": 0.000883311324390119, + "loss": 0.93335414, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1275, + "time_per_iteration": 2.721343517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138447, + "balance_loss_mlp": 1.12315261, + "diversity_loss_mlp": 0.0, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.10098653640048322, + "language_loss": 0.81237984, + "learning_rate": 0.0008831112093324629, + "loss": 0.82376432, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.15283203, + "routerloss_mlp": 0.0, + "step": 1276, + "time_per_iteration": 3.066657543182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148152, + "balance_loss_mlp": 1.13266695, + "diversity_loss_mlp": 0.0, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07328274291062464, + "language_loss": 0.89255905, + "learning_rate": 0.0008829109455378444, + "loss": 0.90404058, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 1277, + "time_per_iteration": 2.6705071926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163573, + "balance_loss_mlp": 1.14844561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08343231090098181, + "language_loss": 0.86569774, + "learning_rate": 0.000882710533084013, + "loss": 0.87733346, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1278, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.13783133, + "diversity_loss_mlp": 0.0, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0729065811951457, + "language_loss": 0.8929435, + "learning_rate": 0.0008825099720487755, + "loss": 0.90446699, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1279, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00676302, + "balance_loss_mlp": 1.12665224, + "diversity_loss_mlp": 0.19835761, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.0027483074809680533, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.75937444, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0137972, + "step": 1280, + "time_per_iteration": 4.88429594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_mlp": 1.10232449, + "diversity_loss_mlp": 0.0, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.05615046205501133, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79055113, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1281, + "time_per_iteration": 4.752316236495972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113823, + "balance_loss_mlp": 1.09987593, + "diversity_loss_mlp": 0.0, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.08093958913819582, + "language_loss": 0.89542687, + "learning_rate": 0.0008819073982335619, + "loss": 0.90656507, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1282, + "time_per_iteration": 2.876927137374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_mlp": 1.08783603, + "diversity_loss_mlp": 0.0, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07169123109412263, + "language_loss": 0.84362143, + "learning_rate": 0.0008817062436519235, + "loss": 0.8546381, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.13824463, + "routerloss_mlp": 0.0, + "step": 1283, + "time_per_iteration": 2.6551387310028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086846, + "balance_loss_mlp": 1.5022366, + "diversity_loss_mlp": 0.20048198, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.033180516132009126, + "language_loss": 0.89655471, + "learning_rate": 0.0008815049408787788, + "loss": 0.90523928, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01710081, + "step": 1284, + "time_per_iteration": 2.5652830600738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_mlp": 1.08698821, + "diversity_loss_mlp": 0.0, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.0762028673981185, + "language_loss": 0.85473216, + "learning_rate": 0.0008813034899922805, + "loss": 0.86573577, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1285, + "time_per_iteration": 2.549622058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111306, + "balance_loss_mlp": 1.09783578, + "diversity_loss_mlp": 0.0, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.11471388318643767, + "language_loss": 0.89855313, + "learning_rate": 0.0008811018910706387, + "loss": 0.9096663, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1286, + "time_per_iteration": 2.575176954269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_mlp": 1.10453439, + "diversity_loss_mlp": 0.0, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.10517914532856759, + "language_loss": 0.81922066, + "learning_rate": 0.0008809001441921211, + "loss": 0.83040059, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1287, + "time_per_iteration": 2.732236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.1132865, + "diversity_loss_mlp": 0.0, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.1440229573277689, + "language_loss": 0.85392761, + "learning_rate": 0.0008806982494350528, + "loss": 0.86519527, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1288, + "time_per_iteration": 2.6544177532196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168468, + "balance_loss_mlp": 1.1549263, + "diversity_loss_mlp": 0.0, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07192560701016996, + "language_loss": 0.9021467, + "learning_rate": 0.0008804962068778161, + "loss": 0.91383135, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1289, + "time_per_iteration": 2.8321304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_mlp": 1.20329499, + "diversity_loss_mlp": 0.0, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.08274381184261048, + "language_loss": 0.81234664, + "learning_rate": 0.0008802940165988511, + "loss": 0.82451665, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1290, + "time_per_iteration": 2.848726749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262968, + "balance_loss_mlp": 1.24875808, + "diversity_loss_mlp": 0.0, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.09449787402071168, + "language_loss": 0.88461435, + "learning_rate": 0.000880091678676655, + "loss": 0.8972441, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1291, + "time_per_iteration": 2.802199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279654, + "balance_loss_mlp": 1.26553965, + "diversity_loss_mlp": 0.0, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.11843407890200246, + "language_loss": 0.88870949, + "learning_rate": 0.0008798891931897821, + "loss": 0.90150601, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1292, + "time_per_iteration": 2.7150259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870403, + "balance_loss_mlp": 1.50883341, + "diversity_loss_mlp": 0.20002533, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.035309457370921726, + "language_loss": 0.84031773, + "learning_rate": 0.0008796865602168447, + "loss": 0.84902173, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597392, + "step": 1293, + "time_per_iteration": 2.5952000617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210957, + "balance_loss_mlp": 1.19661582, + "diversity_loss_mlp": 0.0, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.07909897749306223, + "language_loss": 0.88611919, + "learning_rate": 0.0008794837798365115, + "loss": 0.89822876, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1294, + "time_per_iteration": 2.6257524490356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.15246725, + "diversity_loss_mlp": 0.0, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.06704316740686254, + "language_loss": 0.8866623, + "learning_rate": 0.0008792808521275089, + "loss": 0.89834166, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1295, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_mlp": 1.13757372, + "diversity_loss_mlp": 0.0, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.08601952378824393, + "language_loss": 0.87496305, + "learning_rate": 0.0008790777771686206, + "loss": 0.88649786, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1296, + "time_per_iteration": 2.6131319999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_mlp": 1.10882747, + "diversity_loss_mlp": 0.0, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.0951042007575699, + "language_loss": 0.8543523, + "learning_rate": 0.0008788745550386872, + "loss": 0.86559939, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 1297, + "time_per_iteration": 2.5590503215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115503, + "balance_loss_mlp": 1.09948111, + "diversity_loss_mlp": 0.0, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.07219065567928346, + "language_loss": 0.80291975, + "learning_rate": 0.0008786711858166063, + "loss": 0.81407487, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1298, + "time_per_iteration": 2.951768398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00871436, + "balance_loss_mlp": 1.51113367, + "diversity_loss_mlp": 0.19870289, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.03357842357877673, + "language_loss": 0.83488023, + "learning_rate": 0.0008784676695813332, + "loss": 0.84359455, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165179, + "step": 1299, + "time_per_iteration": 2.985684871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_mlp": 1.07411456, + "diversity_loss_mlp": 0.0, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07050099983107566, + "language_loss": 0.84900999, + "learning_rate": 0.0008782640064118796, + "loss": 0.85990846, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1300, + "time_per_iteration": 2.943368673324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139161, + "balance_loss_mlp": 1.13172245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.062054541004710057, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323914, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 1301, + "time_per_iteration": 4.975619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_mlp": 1.09055138, + "diversity_loss_mlp": 0.0, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.08145949094764637, + "language_loss": 0.86554521, + "learning_rate": 0.0008778562395867648, + "loss": 0.87660533, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1302, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111342, + "balance_loss_mlp": 1.09572554, + "diversity_loss_mlp": 0.0, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.0727542370097133, + "language_loss": 0.84224409, + "learning_rate": 0.0008776521360894127, + "loss": 0.85335743, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 1303, + "time_per_iteration": 2.6512627601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_mlp": 1.02259421, + "diversity_loss_mlp": 0.0, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.02979233866947858, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79991817, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.07128906, + "routerloss_mlp": 0.0, + "step": 1304, + "time_per_iteration": 4.802467107772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112993, + "balance_loss_mlp": 1.11518431, + "diversity_loss_mlp": 0.0, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.07060498048015267, + "language_loss": 0.9057076, + "learning_rate": 0.0008772434893213186, + "loss": 0.91700697, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1305, + "time_per_iteration": 2.601546049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137218, + "balance_loss_mlp": 1.12251997, + "diversity_loss_mlp": 0.0, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.13797279723809866, + "language_loss": 0.84362888, + "learning_rate": 0.0008770389462092276, + "loss": 0.85500103, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1306, + "time_per_iteration": 2.626138210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_mlp": 1.12685966, + "diversity_loss_mlp": 0.0, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08471108342240245, + "language_loss": 0.86803389, + "learning_rate": 0.0008768342567176357, + "loss": 0.87944913, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1307, + "time_per_iteration": 2.8074796199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.12681937, + "diversity_loss_mlp": 0.0, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07263390393133992, + "language_loss": 0.90559924, + "learning_rate": 0.0008766294209260107, + "loss": 0.91701508, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1308, + "time_per_iteration": 2.670790910720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.13312435, + "diversity_loss_mlp": 0.0, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.07764888634730133, + "language_loss": 0.91554916, + "learning_rate": 0.0008764244389138767, + "loss": 0.92702377, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1309, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147476, + "balance_loss_mlp": 1.13318276, + "diversity_loss_mlp": 0.0, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09714227143719616, + "language_loss": 0.82980847, + "learning_rate": 0.000876219310760815, + "loss": 0.8412832, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1310, + "time_per_iteration": 2.8601791858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_mlp": 1.13273418, + "diversity_loss_mlp": 0.0, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.09648806821544922, + "language_loss": 0.81436276, + "learning_rate": 0.0008760140365464631, + "loss": 0.82583249, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1311, + "time_per_iteration": 2.599353790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870128, + "balance_loss_mlp": 1.50605726, + "diversity_loss_mlp": 0.20002663, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.03529693250820236, + "language_loss": 0.871418, + "learning_rate": 0.0008758086163505156, + "loss": 0.88011926, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170862, + "step": 1312, + "time_per_iteration": 2.6166832447052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163863, + "balance_loss_mlp": 1.14953399, + "diversity_loss_mlp": 0.0, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.07147814499844148, + "language_loss": 0.89267951, + "learning_rate": 0.0008756030502527239, + "loss": 0.90431809, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1313, + "time_per_iteration": 2.8452062606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188075, + "balance_loss_mlp": 1.17377019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.09335955432973846, + "language_loss": 0.90298462, + "learning_rate": 0.0008753973383328954, + "loss": 0.91486537, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1314, + "time_per_iteration": 2.6988537311553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.15108287, + "diversity_loss_mlp": 0.0, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08872096542459323, + "language_loss": 0.83944553, + "learning_rate": 0.0008751914806708952, + "loss": 0.85110015, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1315, + "time_per_iteration": 2.6328680515289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.1372478, + "diversity_loss_mlp": 0.0, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.09247066962171595, + "language_loss": 0.81854099, + "learning_rate": 0.0008749854773466439, + "loss": 0.83005595, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1316, + "time_per_iteration": 2.6708498001098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134446, + "balance_loss_mlp": 1.11980653, + "diversity_loss_mlp": 0.0, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.06992463478304738, + "language_loss": 0.84568423, + "learning_rate": 0.0008747793284401192, + "loss": 0.85702872, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1317, + "time_per_iteration": 2.70182204246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_mlp": 1.10560477, + "diversity_loss_mlp": 0.0, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.11229953955213261, + "language_loss": 0.85994983, + "learning_rate": 0.0008745730340313551, + "loss": 0.87115788, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1318, + "time_per_iteration": 2.8026556968688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.1048007, + "diversity_loss_mlp": 0.0, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.0843917818222923, + "language_loss": 0.84519732, + "learning_rate": 0.0008743665942004422, + "loss": 0.85639453, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.14904785, + "routerloss_mlp": 0.0, + "step": 1319, + "time_per_iteration": 2.6717073917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120645, + "balance_loss_mlp": 1.10569644, + "diversity_loss_mlp": 0.0, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.06860607652829093, + "language_loss": 0.92769039, + "learning_rate": 0.0008741600090275277, + "loss": 0.93889689, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1320, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120587, + "balance_loss_mlp": 1.10530448, + "diversity_loss_mlp": 0.0, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.09643257369734548, + "language_loss": 0.8425917, + "learning_rate": 0.0008739532785928151, + "loss": 0.85379755, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1321, + "time_per_iteration": 3.4925267696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_mlp": 1.09305024, + "diversity_loss_mlp": 0.0, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.04547815076873398, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75994641, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1322, + "time_per_iteration": 4.8446879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0085354, + "balance_loss_mlp": 1.4814328, + "diversity_loss_mlp": 0.19370571, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.036800523279172735, + "language_loss": 0.82844102, + "learning_rate": 0.0008735393822590908, + "loss": 0.83697641, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597124, + "step": 1323, + "time_per_iteration": 2.7354650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174586, + "balance_loss_mlp": 1.16032863, + "diversity_loss_mlp": 0.0, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08280852347492981, + "language_loss": 0.87442601, + "learning_rate": 0.0008733322165207681, + "loss": 0.88617194, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1324, + "time_per_iteration": 2.6581695079803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_mlp": 1.18529749, + "diversity_loss_mlp": 0.0, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.0779912319299164, + "language_loss": 0.8296451, + "learning_rate": 0.0008731249058420247, + "loss": 0.84164721, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1325, + "time_per_iteration": 3.0674960613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203892, + "balance_loss_mlp": 1.18865728, + "diversity_loss_mlp": 0.0, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.10695670124077197, + "language_loss": 0.90080667, + "learning_rate": 0.0008729174503033459, + "loss": 0.91284555, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1326, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188403, + "balance_loss_mlp": 1.17334652, + "diversity_loss_mlp": 0.0, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.10125548093505272, + "language_loss": 0.82427752, + "learning_rate": 0.0008727098499852728, + "loss": 0.83616149, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1327, + "time_per_iteration": 2.833803415298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150318, + "balance_loss_mlp": 1.13529778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.08478455973869617, + "language_loss": 0.89778203, + "learning_rate": 0.0008725021049684034, + "loss": 0.90928519, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.15002441, + "routerloss_mlp": 0.0, + "step": 1328, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116795, + "balance_loss_mlp": 1.10194123, + "diversity_loss_mlp": 0.0, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.07099770943741918, + "language_loss": 0.83078361, + "learning_rate": 0.000872294215333391, + "loss": 0.84195161, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1329, + "time_per_iteration": 3.219834089279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099158, + "balance_loss_mlp": 1.08430433, + "diversity_loss_mlp": 0.0, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.06913408205057751, + "language_loss": 0.82662833, + "learning_rate": 0.0008720861811609457, + "loss": 0.8376199, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1330, + "time_per_iteration": 2.753122329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096587, + "balance_loss_mlp": 1.0816741, + "diversity_loss_mlp": 0.0, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0919113566921475, + "language_loss": 0.83719599, + "learning_rate": 0.0008718780025318338, + "loss": 0.84816188, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1331, + "time_per_iteration": 2.724808692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.09296656, + "diversity_loss_mlp": 0.0, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.09880415123515712, + "language_loss": 0.83982158, + "learning_rate": 0.0008716696795268771, + "loss": 0.85089689, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1332, + "time_per_iteration": 2.718421220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098797, + "balance_loss_mlp": 1.08430111, + "diversity_loss_mlp": 0.0, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.15208681676824193, + "language_loss": 0.85333431, + "learning_rate": 0.0008714612122269538, + "loss": 0.8643223, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1333, + "time_per_iteration": 2.877823829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120258, + "balance_loss_mlp": 1.10586989, + "diversity_loss_mlp": 0.0, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.07756137703605612, + "language_loss": 0.89334106, + "learning_rate": 0.0008712526007129982, + "loss": 0.90454364, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1334, + "time_per_iteration": 2.561842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155014, + "balance_loss_mlp": 1.14101923, + "diversity_loss_mlp": 0.0, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.12724628219842446, + "language_loss": 0.90676123, + "learning_rate": 0.0008710438450660003, + "loss": 0.91831136, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1335, + "time_per_iteration": 2.6618270874023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199277, + "balance_loss_mlp": 1.18486404, + "diversity_loss_mlp": 0.0, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.10895723532104484, + "language_loss": 0.87596953, + "learning_rate": 0.0008708349453670064, + "loss": 0.88796222, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1336, + "time_per_iteration": 2.5121865272521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195197, + "balance_loss_mlp": 1.18032002, + "diversity_loss_mlp": 0.0, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.10227195785495524, + "language_loss": 0.91035736, + "learning_rate": 0.0008706259016971185, + "loss": 0.92230934, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1337, + "time_per_iteration": 2.7760090827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_mlp": 1.17414773, + "diversity_loss_mlp": 0.0, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.12625436277937716, + "language_loss": 0.83095431, + "learning_rate": 0.0008704167141374944, + "loss": 0.84284496, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1338, + "time_per_iteration": 2.824122428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_mlp": 1.13107228, + "diversity_loss_mlp": 0.0, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.0801465901926633, + "language_loss": 0.88427222, + "learning_rate": 0.0008702073827693482, + "loss": 0.89573455, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1339, + "time_per_iteration": 2.708488941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_mlp": 1.0865202, + "diversity_loss_mlp": 0.0, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07445900988257396, + "language_loss": 0.88514435, + "learning_rate": 0.0008699979076739494, + "loss": 0.89615613, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1340, + "time_per_iteration": 2.960650682449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07054412, + "diversity_loss_mlp": 0.0, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.09041758143252471, + "language_loss": 0.88622832, + "learning_rate": 0.0008697882889326234, + "loss": 0.89708054, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1341, + "time_per_iteration": 2.5199689865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094608, + "balance_loss_mlp": 1.08043432, + "diversity_loss_mlp": 0.0, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.08157938691300957, + "language_loss": 0.86840844, + "learning_rate": 0.0008695785266267515, + "loss": 0.87935448, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1342, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089859, + "balance_loss_mlp": 1.56664371, + "diversity_loss_mlp": 0.19803861, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.03344075262961686, + "language_loss": 0.83491886, + "learning_rate": 0.0008693686208377704, + "loss": 0.84390479, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01624843, + "step": 1343, + "time_per_iteration": 2.8157622814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101399, + "balance_loss_mlp": 1.08711743, + "diversity_loss_mlp": 0.0, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07460013341605923, + "language_loss": 0.89022982, + "learning_rate": 0.0008691585716471733, + "loss": 0.90124375, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1344, + "time_per_iteration": 2.6386232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.09707415, + "diversity_loss_mlp": 0.0, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.08548738123283665, + "language_loss": 0.85822487, + "learning_rate": 0.0008689483791365079, + "loss": 0.86934054, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1345, + "time_per_iteration": 2.831817626953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112096, + "balance_loss_mlp": 1.10685778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07218857890204664, + "language_loss": 0.89327282, + "learning_rate": 0.0008687380433873786, + "loss": 0.90448248, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1346, + "time_per_iteration": 2.8322408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1251955, + "diversity_loss_mlp": 0.0, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.07612070672802876, + "language_loss": 0.82638776, + "learning_rate": 0.0008685275644814448, + "loss": 0.83778065, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1347, + "time_per_iteration": 2.689772367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116224, + "balance_loss_mlp": 1.14764857, + "diversity_loss_mlp": 0.0, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07884944678342334, + "language_loss": 0.84390515, + "learning_rate": 0.0008683169425004216, + "loss": 0.85552752, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1348, + "time_per_iteration": 2.895153760910034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159732, + "balance_loss_mlp": 1.14511704, + "diversity_loss_mlp": 0.0, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.10354145261803285, + "language_loss": 0.83314335, + "learning_rate": 0.0008681061775260799, + "loss": 0.84474063, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1349, + "time_per_iteration": 2.850862503051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_mlp": 1.15118265, + "diversity_loss_mlp": 0.0, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08416928552821445, + "language_loss": 0.9214983, + "learning_rate": 0.0008678952696402458, + "loss": 0.93315852, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1350, + "time_per_iteration": 2.525019884109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_mlp": 1.13848734, + "diversity_loss_mlp": 0.0, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07397225666721696, + "language_loss": 0.86554277, + "learning_rate": 0.000867684218924801, + "loss": 0.87707639, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1351, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_mlp": 1.07517958, + "diversity_loss_mlp": 0.0, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.0438698963901256, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80030328, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1352, + "time_per_iteration": 4.916059255599976 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132931, + "balance_loss_mlp": 1.11807716, + "diversity_loss_mlp": 0.0, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06358739416567256, + "language_loss": 0.85154414, + "learning_rate": 0.0008672616893328834, + "loss": 0.86287344, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1353, + "time_per_iteration": 2.9301464557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120208, + "balance_loss_mlp": 1.10545015, + "diversity_loss_mlp": 0.0, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.0804298790611747, + "language_loss": 0.89736795, + "learning_rate": 0.0008670502106204512, + "loss": 0.90857005, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.14733887, + "routerloss_mlp": 0.0, + "step": 1354, + "time_per_iteration": 2.8392651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121529, + "balance_loss_mlp": 1.10672283, + "diversity_loss_mlp": 0.0, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.08121830869095954, + "language_loss": 0.81676221, + "learning_rate": 0.0008668385894064892, + "loss": 0.82797754, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1355, + "time_per_iteration": 2.632744550704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.10095191, + "diversity_loss_mlp": 0.0, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.0871855710564252, + "language_loss": 0.88984954, + "learning_rate": 0.0008666268257731562, + "loss": 0.90100139, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1356, + "time_per_iteration": 3.0961363315582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132093, + "balance_loss_mlp": 1.11785948, + "diversity_loss_mlp": 0.0, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.08548634624367135, + "language_loss": 0.8594982, + "learning_rate": 0.0008664149198026662, + "loss": 0.87081909, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1357, + "time_per_iteration": 3.2423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_mlp": 1.12039137, + "diversity_loss_mlp": 0.0, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.09109654485188295, + "language_loss": 0.88802171, + "learning_rate": 0.0008662028715772883, + "loss": 0.89936113, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1358, + "time_per_iteration": 2.619495153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138578, + "balance_loss_mlp": 1.12476182, + "diversity_loss_mlp": 0.0, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.07135790209188476, + "language_loss": 0.85816395, + "learning_rate": 0.0008659906811793467, + "loss": 0.86954975, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.13842773, + "routerloss_mlp": 0.0, + "step": 1359, + "time_per_iteration": 2.6752817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135637, + "balance_loss_mlp": 1.12191582, + "diversity_loss_mlp": 0.0, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.07783428421444573, + "language_loss": 0.89649427, + "learning_rate": 0.0008657783486912215, + "loss": 0.90785068, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1360, + "time_per_iteration": 2.770136594772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918859, + "balance_loss_mlp": 1.60386825, + "diversity_loss_mlp": 0.20058532, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.03438194549161764, + "language_loss": 0.90315008, + "learning_rate": 0.0008655658741953472, + "loss": 0.91233867, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01663268, + "step": 1361, + "time_per_iteration": 3.239567518234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117406, + "balance_loss_mlp": 1.10352993, + "diversity_loss_mlp": 0.0, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.053733033776962646, + "language_loss": 0.88311911, + "learning_rate": 0.0008653532577742136, + "loss": 0.89429319, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1362, + "time_per_iteration": 2.6912107467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111717, + "balance_loss_mlp": 1.09805584, + "diversity_loss_mlp": 0.0, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07456283347469675, + "language_loss": 0.8687824, + "learning_rate": 0.0008651404995103659, + "loss": 0.87989956, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1363, + "time_per_iteration": 2.5554919242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_mlp": 1.09212554, + "diversity_loss_mlp": 0.0, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.0735216597505126, + "language_loss": 0.87311852, + "learning_rate": 0.0008649275994864041, + "loss": 0.88418221, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1364, + "time_per_iteration": 2.7228429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109936, + "balance_loss_mlp": 1.0955832, + "diversity_loss_mlp": 0.0, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06423000395680191, + "language_loss": 0.83767593, + "learning_rate": 0.0008647145577849834, + "loss": 0.84877527, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1365, + "time_per_iteration": 2.8194234371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_mlp": 1.09573257, + "diversity_loss_mlp": 0.0, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0636918785190987, + "language_loss": 0.82912111, + "learning_rate": 0.0008645013744888139, + "loss": 0.8402251, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1366, + "time_per_iteration": 2.9121909141540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106528, + "balance_loss_mlp": 1.09266424, + "diversity_loss_mlp": 0.0, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.07268525177684865, + "language_loss": 0.87255573, + "learning_rate": 0.0008642880496806607, + "loss": 0.88362104, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1367, + "time_per_iteration": 2.7527663707733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117256, + "balance_loss_mlp": 1.1027844, + "diversity_loss_mlp": 0.0, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.06883104565378229, + "language_loss": 0.84193766, + "learning_rate": 0.0008640745834433437, + "loss": 0.85311019, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1368, + "time_per_iteration": 2.7203800678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114698, + "balance_loss_mlp": 1.10065532, + "diversity_loss_mlp": 0.0, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.0718323039568536, + "language_loss": 0.87083656, + "learning_rate": 0.000863860975859738, + "loss": 0.88198352, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1369, + "time_per_iteration": 2.9021553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116648, + "balance_loss_mlp": 1.10278392, + "diversity_loss_mlp": 0.0, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08463505288724613, + "language_loss": 0.88568735, + "learning_rate": 0.0008636472270127733, + "loss": 0.8968538, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1370, + "time_per_iteration": 2.6336748600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_mlp": 1.10440779, + "diversity_loss_mlp": 0.0, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.08505114845208346, + "language_loss": 0.90530956, + "learning_rate": 0.0008634333369854345, + "loss": 0.91649872, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1371, + "time_per_iteration": 2.585775136947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122621, + "balance_loss_mlp": 1.10868549, + "diversity_loss_mlp": 0.0, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.07138701063901956, + "language_loss": 0.87574148, + "learning_rate": 0.0008632193058607608, + "loss": 0.88696772, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.13952637, + "routerloss_mlp": 0.0, + "step": 1372, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124687, + "balance_loss_mlp": 1.11042953, + "diversity_loss_mlp": 0.0, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.09395332240398839, + "language_loss": 0.81125695, + "learning_rate": 0.0008630051337218466, + "loss": 0.82250381, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1373, + "time_per_iteration": 2.6700031757354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118707, + "balance_loss_mlp": 1.10506988, + "diversity_loss_mlp": 0.0, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0808240378873911, + "language_loss": 0.82403839, + "learning_rate": 0.0008627908206518409, + "loss": 0.83522546, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1374, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.04099598647265769, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76212597, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 1375, + "time_per_iteration": 4.979893922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109458, + "balance_loss_mlp": 1.09580863, + "diversity_loss_mlp": 0.0, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06989177478220372, + "language_loss": 0.91488004, + "learning_rate": 0.0008623617720514241, + "loss": 0.92597461, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1376, + "time_per_iteration": 2.6515755653381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_mlp": 1.09554029, + "diversity_loss_mlp": 0.0, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.07399727326907257, + "language_loss": 0.84706682, + "learning_rate": 0.0008621470366875848, + "loss": 0.85816133, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1377, + "time_per_iteration": 2.599776268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119233, + "balance_loss_mlp": 1.10546422, + "diversity_loss_mlp": 0.0, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07769258092785128, + "language_loss": 0.87980253, + "learning_rate": 0.0008619321607257966, + "loss": 0.89099485, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1378, + "time_per_iteration": 2.678865671157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.10274947, + "diversity_loss_mlp": 0.0, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07519514659764338, + "language_loss": 0.82002568, + "learning_rate": 0.000861717144249482, + "loss": 0.83118635, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1379, + "time_per_iteration": 2.8830740451812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_mlp": 1.10515702, + "diversity_loss_mlp": 0.0, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06542821866252439, + "language_loss": 0.89670694, + "learning_rate": 0.0008615019873421175, + "loss": 0.90789306, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.1348877, + "routerloss_mlp": 0.0, + "step": 1380, + "time_per_iteration": 2.4692320823669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124803, + "balance_loss_mlp": 1.11096311, + "diversity_loss_mlp": 0.0, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.08230289019981965, + "language_loss": 0.85984069, + "learning_rate": 0.0008612866900872349, + "loss": 0.87108874, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1381, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119212, + "balance_loss_mlp": 1.10564578, + "diversity_loss_mlp": 0.0, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.09708901974799254, + "language_loss": 0.8800329, + "learning_rate": 0.0008610712525684197, + "loss": 0.89122504, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1382, + "time_per_iteration": 2.673672676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.12075388, + "diversity_loss_mlp": 0.0, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.08550137436350284, + "language_loss": 0.84231853, + "learning_rate": 0.0008608556748693121, + "loss": 0.85366714, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1383, + "time_per_iteration": 3.285391330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113293, + "balance_loss_mlp": 1.11881518, + "diversity_loss_mlp": 0.0, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.07276264363306281, + "language_loss": 0.86098409, + "learning_rate": 0.000860639957073607, + "loss": 0.87231338, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1384, + "time_per_iteration": 2.74979829788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130834, + "balance_loss_mlp": 1.11668396, + "diversity_loss_mlp": 0.0, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07735164598050102, + "language_loss": 0.87488532, + "learning_rate": 0.0008604240992650534, + "loss": 0.88619369, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1385, + "time_per_iteration": 2.765714406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113264, + "balance_loss_mlp": 1.11819148, + "diversity_loss_mlp": 0.0, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.09224305204204497, + "language_loss": 0.89344275, + "learning_rate": 0.0008602081015274545, + "loss": 0.90476912, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1386, + "time_per_iteration": 2.7466471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_mlp": 1.11580229, + "diversity_loss_mlp": 0.0, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.08049268911379595, + "language_loss": 0.83551365, + "learning_rate": 0.0008599919639446684, + "loss": 0.84681749, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1387, + "time_per_iteration": 2.680053234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119435, + "balance_loss_mlp": 1.10439074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08313146027802099, + "language_loss": 0.80363739, + "learning_rate": 0.000859775686600607, + "loss": 0.81483173, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1388, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114186, + "balance_loss_mlp": 1.12722135, + "diversity_loss_mlp": 0.0, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.08559032433145165, + "language_loss": 0.85052109, + "learning_rate": 0.0008595592695792367, + "loss": 0.86193967, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1389, + "time_per_iteration": 2.660012722015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_mlp": 1.11312914, + "diversity_loss_mlp": 0.0, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.07620364037172102, + "language_loss": 0.90774226, + "learning_rate": 0.0008593427129645778, + "loss": 0.91901946, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1390, + "time_per_iteration": 2.62744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131127, + "balance_loss_mlp": 1.11615419, + "diversity_loss_mlp": 0.0, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.0742307152228864, + "language_loss": 0.85619152, + "learning_rate": 0.0008591260168407052, + "loss": 0.86750275, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1391, + "time_per_iteration": 2.738680124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_mlp": 1.09930313, + "diversity_loss_mlp": 0.0, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.05574398067767488, + "language_loss": 0.82839364, + "learning_rate": 0.0008589091812917479, + "loss": 0.83953172, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1392, + "time_per_iteration": 2.5947506427764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_mlp": 1.09471345, + "diversity_loss_mlp": 0.0, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07022348692687568, + "language_loss": 0.85257161, + "learning_rate": 0.0008586922064018887, + "loss": 0.86366403, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1393, + "time_per_iteration": 2.6624581813812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_mlp": 1.09542501, + "diversity_loss_mlp": 0.0, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.07561979453055602, + "language_loss": 0.89401793, + "learning_rate": 0.0008584750922553651, + "loss": 0.9051199, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1394, + "time_per_iteration": 3.1940202713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107917, + "balance_loss_mlp": 1.0934931, + "diversity_loss_mlp": 0.0, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.07234350422575066, + "language_loss": 0.83740592, + "learning_rate": 0.0008582578389364677, + "loss": 0.84848505, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1395, + "time_per_iteration": 2.8844621181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_mlp": 1.09147811, + "diversity_loss_mlp": 0.0, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.061968206774760184, + "language_loss": 0.91908813, + "learning_rate": 0.0008580404465295422, + "loss": 0.93014938, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1396, + "time_per_iteration": 2.7842769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_mlp": 1.09155917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07293181793333794, + "language_loss": 0.88274646, + "learning_rate": 0.0008578229151189876, + "loss": 0.89380777, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1397, + "time_per_iteration": 2.96771502494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_mlp": 1.08638036, + "diversity_loss_mlp": 0.0, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.08798004746081324, + "language_loss": 0.81253606, + "learning_rate": 0.0008576052447892573, + "loss": 0.82354569, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1398, + "time_per_iteration": 2.5413830280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_mlp": 1.08761334, + "diversity_loss_mlp": 0.0, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.0737959226904994, + "language_loss": 0.86320835, + "learning_rate": 0.000857387435624858, + "loss": 0.87422657, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1399, + "time_per_iteration": 2.554016351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00934821, + "balance_loss_mlp": 1.63627267, + "diversity_loss_mlp": 0.20064378, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.02492172823463741, + "language_loss": 0.88190895, + "learning_rate": 0.0008571694877103513, + "loss": 0.89125717, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636335, + "step": 1400, + "time_per_iteration": 3.307114839553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_mlp": 1.09591365, + "diversity_loss_mlp": 0.0, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.07757128819182789, + "language_loss": 0.87680864, + "learning_rate": 0.0008569514011303515, + "loss": 0.88791251, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1401, + "time_per_iteration": 2.800502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917512, + "balance_loss_mlp": 1.60226941, + "diversity_loss_mlp": 0.19939175, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.03393521208879438, + "language_loss": 0.88186574, + "learning_rate": 0.0008567331759695277, + "loss": 0.8910408, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01668182, + "step": 1402, + "time_per_iteration": 2.7670016288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108043, + "balance_loss_mlp": 1.09297514, + "diversity_loss_mlp": 0.0, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.0674494366068644, + "language_loss": 0.86427194, + "learning_rate": 0.0008565148123126023, + "loss": 0.87535238, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1403, + "time_per_iteration": 2.660659074783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_mlp": 1.08053553, + "diversity_loss_mlp": 0.0, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.059221605294443855, + "language_loss": 0.86113608, + "learning_rate": 0.0008562963102443516, + "loss": 0.8720839, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1404, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_mlp": 1.090042, + "diversity_loss_mlp": 0.0, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.08483345099627004, + "language_loss": 0.85166299, + "learning_rate": 0.0008560776698496056, + "loss": 0.86270541, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1405, + "time_per_iteration": 2.9167518615722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110133, + "balance_loss_mlp": 1.09539831, + "diversity_loss_mlp": 0.0, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.06923600464578249, + "language_loss": 0.85861331, + "learning_rate": 0.0008558588912132481, + "loss": 0.86971468, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1406, + "time_per_iteration": 2.8346776962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00696474, + "balance_loss_mlp": 1.17983532, + "diversity_loss_mlp": 0.18206902, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.0036772550136199766, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77155459, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0155216, + "step": 1407, + "time_per_iteration": 4.943782091140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.09137964, + "diversity_loss_mlp": 0.0, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.08329945876184135, + "language_loss": 0.82942384, + "learning_rate": 0.0008554209195555016, + "loss": 0.84047806, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1408, + "time_per_iteration": 2.7417516708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125368, + "balance_loss_mlp": 1.11146832, + "diversity_loss_mlp": 0.0, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.06975199960684045, + "language_loss": 0.8827157, + "learning_rate": 0.0008552017267041483, + "loss": 0.89396936, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1409, + "time_per_iteration": 2.6978721618652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126015, + "balance_loss_mlp": 1.11216331, + "diversity_loss_mlp": 0.0, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06710824628929367, + "language_loss": 0.83395678, + "learning_rate": 0.0008549823959512549, + "loss": 0.84521693, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1410, + "time_per_iteration": 2.6867637634277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_mlp": 1.11246991, + "diversity_loss_mlp": 0.0, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07002470067050659, + "language_loss": 0.86486357, + "learning_rate": 0.0008547629273819728, + "loss": 0.87612069, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.13262939, + "routerloss_mlp": 0.0, + "step": 1411, + "time_per_iteration": 3.410454750061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_mlp": 1.12940812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07619635814943253, + "language_loss": 0.83522588, + "learning_rate": 0.0008545433210815074, + "loss": 0.84665549, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1412, + "time_per_iteration": 2.638172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.12536621, + "diversity_loss_mlp": 0.0, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.06317158203016926, + "language_loss": 0.87351668, + "learning_rate": 0.0008543235771351176, + "loss": 0.88490719, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1413, + "time_per_iteration": 2.7705581188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159735, + "balance_loss_mlp": 1.14645457, + "diversity_loss_mlp": 0.0, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.08259318688939964, + "language_loss": 0.84684592, + "learning_rate": 0.0008541036956281154, + "loss": 0.85844326, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1414, + "time_per_iteration": 2.8803579807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_mlp": 1.13435841, + "diversity_loss_mlp": 0.0, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09396951476817994, + "language_loss": 0.81928164, + "learning_rate": 0.0008538836766458665, + "loss": 0.83076018, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.13519287, + "routerloss_mlp": 0.0, + "step": 1415, + "time_per_iteration": 2.860991954803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140979, + "balance_loss_mlp": 1.12721062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.07553622395064079, + "language_loss": 0.84927893, + "learning_rate": 0.0008536635202737897, + "loss": 0.86068869, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1416, + "time_per_iteration": 2.848196268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146453, + "balance_loss_mlp": 1.13278019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.07031625369418516, + "language_loss": 0.82188255, + "learning_rate": 0.0008534432265973573, + "loss": 0.83334708, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1417, + "time_per_iteration": 2.6029789447784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153419, + "balance_loss_mlp": 1.13950717, + "diversity_loss_mlp": 0.0, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07823597875801033, + "language_loss": 0.88322413, + "learning_rate": 0.000853222795702095, + "loss": 0.89475828, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1418, + "time_per_iteration": 3.3933968544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.13570726, + "diversity_loss_mlp": 0.0, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.07267637680100167, + "language_loss": 0.83730674, + "learning_rate": 0.0008530022276735813, + "loss": 0.84880364, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1419, + "time_per_iteration": 2.766181707382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_mlp": 1.12086129, + "diversity_loss_mlp": 0.0, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.06887995103877555, + "language_loss": 0.86238861, + "learning_rate": 0.0008527815225974489, + "loss": 0.87373358, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1420, + "time_per_iteration": 2.6471102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135972, + "balance_loss_mlp": 1.12148833, + "diversity_loss_mlp": 0.0, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10131461494963417, + "language_loss": 0.88726115, + "learning_rate": 0.0008525606805593829, + "loss": 0.89862096, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1421, + "time_per_iteration": 2.436647653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_mlp": 1.10405266, + "diversity_loss_mlp": 0.0, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0859881194807961, + "language_loss": 0.8254106, + "learning_rate": 0.0008523397016451213, + "loss": 0.83659345, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1422, + "time_per_iteration": 2.593588352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_mlp": 1.08907628, + "diversity_loss_mlp": 0.0, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.06052148467578676, + "language_loss": 0.87038374, + "learning_rate": 0.0008521185859404564, + "loss": 0.88142037, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1423, + "time_per_iteration": 3.3936307430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092129, + "balance_loss_mlp": 1.07775199, + "diversity_loss_mlp": 0.0, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06977326166261295, + "language_loss": 0.8940134, + "learning_rate": 0.0008518973335312326, + "loss": 0.90493476, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1424, + "time_per_iteration": 2.7834270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081272, + "balance_loss_mlp": 1.06702638, + "diversity_loss_mlp": 0.0, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.119675165593639, + "language_loss": 0.83282709, + "learning_rate": 0.0008516759445033477, + "loss": 0.84363985, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1425, + "time_per_iteration": 2.665099859237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.06930685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08266887436661914, + "language_loss": 0.85026807, + "learning_rate": 0.0008514544189427526, + "loss": 0.86110568, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1426, + "time_per_iteration": 2.6887404918670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_mlp": 1.07249546, + "diversity_loss_mlp": 0.0, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.06908859165293682, + "language_loss": 0.86575979, + "learning_rate": 0.0008512327569354511, + "loss": 0.87662017, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1427, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_mlp": 1.09480238, + "diversity_loss_mlp": 0.0, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08987008099145026, + "language_loss": 0.8368206, + "learning_rate": 0.0008510109585675001, + "loss": 0.847902, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.13360596, + "routerloss_mlp": 0.0, + "step": 1428, + "time_per_iteration": 2.613348960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140613, + "balance_loss_mlp": 1.13260245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.05207498704371428, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82293957, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1429, + "time_per_iteration": 4.706013202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133032, + "balance_loss_mlp": 1.11977601, + "diversity_loss_mlp": 0.0, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.09002666847623074, + "language_loss": 0.80503839, + "learning_rate": 0.0008505669530941415, + "loss": 0.8163687, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1430, + "time_per_iteration": 3.2976372241973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0097004, + "balance_loss_mlp": 1.70641518, + "diversity_loss_mlp": 0.20088202, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.03747760406507578, + "language_loss": 0.84294951, + "learning_rate": 0.000850344746161112, + "loss": 0.85264993, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01639144, + "step": 1431, + "time_per_iteration": 2.6297106742858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_mlp": 1.12685704, + "diversity_loss_mlp": 0.0, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.08230554095697513, + "language_loss": 0.87346137, + "learning_rate": 0.0008501224032121894, + "loss": 0.88486063, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1432, + "time_per_iteration": 2.4853787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129049, + "balance_loss_mlp": 1.1158998, + "diversity_loss_mlp": 0.0, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06557126517551867, + "language_loss": 0.82118285, + "learning_rate": 0.0008498999243336946, + "loss": 0.83247334, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1433, + "time_per_iteration": 2.623809576034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11776567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.0832335684907068, + "language_loss": 0.87471139, + "learning_rate": 0.0008496773096120021, + "loss": 0.88601708, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.12817383, + "routerloss_mlp": 0.0, + "step": 1434, + "time_per_iteration": 2.7995760440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_mlp": 1.10637057, + "diversity_loss_mlp": 0.0, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.10286197296711953, + "language_loss": 0.84387434, + "learning_rate": 0.0008494545591335381, + "loss": 0.85507143, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.13354492, + "routerloss_mlp": 0.0, + "step": 1435, + "time_per_iteration": 2.933576822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_mlp": 1.09978795, + "diversity_loss_mlp": 0.0, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.053150449500146836, + "language_loss": 0.86971611, + "learning_rate": 0.0008492316729847823, + "loss": 0.88084674, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1436, + "time_per_iteration": 2.8865604400634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.09676659, + "diversity_loss_mlp": 0.0, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08937825724590943, + "language_loss": 0.7968539, + "learning_rate": 0.0008490086512522664, + "loss": 0.80795395, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1437, + "time_per_iteration": 2.7166872024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105369, + "balance_loss_mlp": 1.0916723, + "diversity_loss_mlp": 0.0, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.09013751301914075, + "language_loss": 0.90582836, + "learning_rate": 0.0008487854940225755, + "loss": 0.91688204, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1438, + "time_per_iteration": 2.4426465034484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_mlp": 1.08844161, + "diversity_loss_mlp": 0.0, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.09066429268698341, + "language_loss": 0.89896768, + "learning_rate": 0.0008485622013823466, + "loss": 0.90999383, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1439, + "time_per_iteration": 2.599177360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07675576, + "diversity_loss_mlp": 0.0, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08059762035463526, + "language_loss": 0.83446515, + "learning_rate": 0.00084833877341827, + "loss": 0.84537244, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1440, + "time_per_iteration": 2.667215347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.0762167, + "diversity_loss_mlp": 0.0, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.07889497077341047, + "language_loss": 0.80625433, + "learning_rate": 0.000848115210217088, + "loss": 0.81715715, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1441, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.08003855, + "diversity_loss_mlp": 0.0, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08443965058939805, + "language_loss": 0.81771946, + "learning_rate": 0.0008478915118655952, + "loss": 0.82866359, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1442, + "time_per_iteration": 2.743678569793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_mlp": 1.10385561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07019455815968899, + "language_loss": 0.86195552, + "learning_rate": 0.0008476676784506393, + "loss": 0.87313789, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1443, + "time_per_iteration": 2.663422107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.10996866, + "diversity_loss_mlp": 0.0, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.08623331537045495, + "language_loss": 0.81889486, + "learning_rate": 0.0008474437100591201, + "loss": 0.83014178, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1444, + "time_per_iteration": 3.340557813644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_mlp": 1.11489129, + "diversity_loss_mlp": 0.0, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08279806566523454, + "language_loss": 0.85577607, + "learning_rate": 0.0008472196067779898, + "loss": 0.86707067, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1445, + "time_per_iteration": 2.675623655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112665, + "balance_loss_mlp": 1.09800267, + "diversity_loss_mlp": 0.0, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10281028137483857, + "language_loss": 0.85108185, + "learning_rate": 0.0008469953686942531, + "loss": 0.86220849, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1446, + "time_per_iteration": 3.0647382736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933894, + "balance_loss_mlp": 1.63962197, + "diversity_loss_mlp": 0.19544066, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.039122045531048345, + "language_loss": 0.83261281, + "learning_rate": 0.0008467709958949668, + "loss": 0.84195173, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636306, + "step": 1447, + "time_per_iteration": 2.777806043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932176, + "balance_loss_mlp": 1.63710666, + "diversity_loss_mlp": 0.19454433, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.036668832644649825, + "language_loss": 0.85678959, + "learning_rate": 0.0008465464884672403, + "loss": 0.8661114, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01635053, + "step": 1448, + "time_per_iteration": 2.7313778400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109364, + "balance_loss_mlp": 1.07944214, + "diversity_loss_mlp": 0.0, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.08672786191572247, + "language_loss": 0.85892808, + "learning_rate": 0.0008463218464982348, + "loss": 0.86986446, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1449, + "time_per_iteration": 2.8115885257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109775, + "balance_loss_mlp": 1.08367157, + "diversity_loss_mlp": 0.0, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.09681901325388456, + "language_loss": 0.8756566, + "learning_rate": 0.0008460970700751645, + "loss": 0.88663405, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1450, + "time_per_iteration": 3.071645975112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.07963276, + "diversity_loss_mlp": 0.0, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.09020366192691211, + "language_loss": 0.87640095, + "learning_rate": 0.000845872159285295, + "loss": 0.88733411, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1451, + "time_per_iteration": 2.7342164516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_mlp": 1.04301238, + "diversity_loss_mlp": 0.0, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.032344288076380935, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78818536, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1452, + "time_per_iteration": 4.95387077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_mlp": 1.10795009, + "diversity_loss_mlp": 0.0, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.08097200979220782, + "language_loss": 0.86171871, + "learning_rate": 0.0008454219349544836, + "loss": 0.87293363, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1453, + "time_per_iteration": 3.373755693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127619, + "balance_loss_mlp": 1.11439896, + "diversity_loss_mlp": 0.0, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.0882994281711823, + "language_loss": 0.81864405, + "learning_rate": 0.000845196621588334, + "loss": 0.82992017, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.13244629, + "routerloss_mlp": 0.0, + "step": 1454, + "time_per_iteration": 2.758122682571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147815, + "balance_loss_mlp": 1.13453507, + "diversity_loss_mlp": 0.0, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.06575509380885615, + "language_loss": 0.76256007, + "learning_rate": 0.0008449711742049706, + "loss": 0.7740382, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1455, + "time_per_iteration": 2.752345561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.1432693, + "diversity_loss_mlp": 0.0, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.10411587441286801, + "language_loss": 0.84306383, + "learning_rate": 0.0008447455928919196, + "loss": 0.85462898, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1456, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.13327312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.07273170046833245, + "language_loss": 0.86767292, + "learning_rate": 0.0008445198777367595, + "loss": 0.87913817, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1457, + "time_per_iteration": 2.614743947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144047, + "balance_loss_mlp": 1.13080251, + "diversity_loss_mlp": 0.0, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.08362811388708001, + "language_loss": 0.81054902, + "learning_rate": 0.0008442940288271208, + "loss": 0.82198954, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1458, + "time_per_iteration": 2.615705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112578, + "balance_loss_mlp": 1.11191583, + "diversity_loss_mlp": 0.0, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06892977395484212, + "language_loss": 0.8688817, + "learning_rate": 0.0008440680462506856, + "loss": 0.88013953, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1459, + "time_per_iteration": 2.810474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_mlp": 1.10828125, + "diversity_loss_mlp": 0.0, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.06441288224223744, + "language_loss": 0.86424565, + "learning_rate": 0.0008438419300951883, + "loss": 0.87545788, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1460, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_mlp": 1.10215354, + "diversity_loss_mlp": 0.0, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.12446768600100189, + "language_loss": 0.86647975, + "learning_rate": 0.0008436156804484148, + "loss": 0.87763494, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1461, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110833, + "balance_loss_mlp": 1.0965395, + "diversity_loss_mlp": 0.0, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.08490544085138897, + "language_loss": 0.88168794, + "learning_rate": 0.0008433892973982031, + "loss": 0.89279622, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1462, + "time_per_iteration": 2.561211347579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10098886, + "diversity_loss_mlp": 0.0, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07295818188475026, + "language_loss": 0.84776855, + "learning_rate": 0.0008431627810324431, + "loss": 0.85892212, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1463, + "time_per_iteration": 2.654146671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_mlp": 1.10345769, + "diversity_loss_mlp": 0.0, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06893619297503142, + "language_loss": 0.8126353, + "learning_rate": 0.000842936131439076, + "loss": 0.82381272, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1464, + "time_per_iteration": 2.6571760177612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_mlp": 1.1010766, + "diversity_loss_mlp": 0.0, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.07879840484237804, + "language_loss": 0.87885797, + "learning_rate": 0.0008427093487060951, + "loss": 0.89001191, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1465, + "time_per_iteration": 2.6847336292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101907, + "balance_loss_mlp": 1.08776927, + "diversity_loss_mlp": 0.0, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06118480673876746, + "language_loss": 0.84661305, + "learning_rate": 0.000842482432921545, + "loss": 0.8576321, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1466, + "time_per_iteration": 2.884965181350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_mlp": 1.09353852, + "diversity_loss_mlp": 0.0, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07927655906335743, + "language_loss": 0.87199128, + "learning_rate": 0.0008422553841735225, + "loss": 0.88306642, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1467, + "time_per_iteration": 2.528017997741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115631, + "balance_loss_mlp": 1.10146928, + "diversity_loss_mlp": 0.0, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07348722340160863, + "language_loss": 0.84837711, + "learning_rate": 0.0008420282025501757, + "loss": 0.85953343, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1468, + "time_per_iteration": 2.7696359157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_mlp": 1.10156429, + "diversity_loss_mlp": 0.0, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07024793700711117, + "language_loss": 0.85080296, + "learning_rate": 0.0008418008881397043, + "loss": 0.86195612, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1469, + "time_per_iteration": 2.659646511077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115825, + "balance_loss_mlp": 1.10241413, + "diversity_loss_mlp": 0.0, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.12791916727658353, + "language_loss": 0.82420468, + "learning_rate": 0.0008415734410303595, + "loss": 0.83536291, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1470, + "time_per_iteration": 3.2350287437438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_mlp": 1.10672879, + "diversity_loss_mlp": 0.0, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.0700140113394834, + "language_loss": 0.90437436, + "learning_rate": 0.0008413458613104444, + "loss": 0.91557699, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1471, + "time_per_iteration": 2.7219245433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111254, + "balance_loss_mlp": 1.09766376, + "diversity_loss_mlp": 0.0, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.07145574186167022, + "language_loss": 0.83164495, + "learning_rate": 0.0008411181490683129, + "loss": 0.84275752, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1472, + "time_per_iteration": 2.727936029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_mlp": 1.09348917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.0645149730480124, + "language_loss": 0.82377428, + "learning_rate": 0.0008408903043923707, + "loss": 0.83485162, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1473, + "time_per_iteration": 2.9972269535064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_mlp": 1.1004951, + "diversity_loss_mlp": 0.0, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09233547648167305, + "language_loss": 0.81268132, + "learning_rate": 0.0008406623273710754, + "loss": 0.82382679, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1474, + "time_per_iteration": 2.5923123359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_mlp": 1.09263408, + "diversity_loss_mlp": 0.0, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0761903935255829, + "language_loss": 0.8290056, + "learning_rate": 0.0008404342180929351, + "loss": 0.840065, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1475, + "time_per_iteration": 2.664698600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121728, + "balance_loss_mlp": 1.10819817, + "diversity_loss_mlp": 0.0, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.08946081876366527, + "language_loss": 0.81824017, + "learning_rate": 0.00084020597664651, + "loss": 0.82945752, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1476, + "time_per_iteration": 2.7941510677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113829, + "balance_loss_mlp": 1.10019112, + "diversity_loss_mlp": 0.0, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09030679544521746, + "language_loss": 0.83820337, + "learning_rate": 0.0008399776031204111, + "loss": 0.84934169, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1477, + "time_per_iteration": 2.7508158683776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_mlp": 1.08784389, + "diversity_loss_mlp": 0.0, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07642048536310797, + "language_loss": 0.79864645, + "learning_rate": 0.0008397490976033009, + "loss": 0.80966175, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1478, + "time_per_iteration": 2.6500625610351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_mlp": 1.04673624, + "diversity_loss_mlp": 0.0, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.0303646120618472, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933775, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.07373047, + "routerloss_mlp": 0.0, + "step": 1479, + "time_per_iteration": 4.757360935211182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_mlp": 1.08449173, + "diversity_loss_mlp": 0.0, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06570619267025138, + "language_loss": 0.85133117, + "learning_rate": 0.0008392916909509525, + "loss": 0.86231726, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1480, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093081, + "balance_loss_mlp": 1.07888281, + "diversity_loss_mlp": 0.0, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07896332999012158, + "language_loss": 0.8543641, + "learning_rate": 0.0008390627899932954, + "loss": 0.86529493, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1481, + "time_per_iteration": 2.5937705039978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_mlp": 1.08532953, + "diversity_loss_mlp": 0.0, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.08879627929694006, + "language_loss": 0.88894033, + "learning_rate": 0.000838833757399789, + "loss": 0.89994287, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1482, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_mlp": 1.09247661, + "diversity_loss_mlp": 0.0, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08557616325511565, + "language_loss": 0.80760586, + "learning_rate": 0.0008386045932593515, + "loss": 0.81867552, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1483, + "time_per_iteration": 2.6901025772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.09776473, + "diversity_loss_mlp": 0.0, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.0661413109298982, + "language_loss": 0.86017227, + "learning_rate": 0.0008383752976609525, + "loss": 0.87129307, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1484, + "time_per_iteration": 2.9148330688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_mlp": 1.1014719, + "diversity_loss_mlp": 0.0, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06788684976720215, + "language_loss": 0.80004096, + "learning_rate": 0.0008381458706936123, + "loss": 0.81120521, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1485, + "time_per_iteration": 2.681067943572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112387, + "balance_loss_mlp": 1.09728312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06920905175587555, + "language_loss": 0.8725493, + "learning_rate": 0.0008379163124464025, + "loss": 0.88367319, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1486, + "time_per_iteration": 2.7093162536621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.10290396, + "diversity_loss_mlp": 0.0, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.09647963836289664, + "language_loss": 0.77093983, + "learning_rate": 0.0008376866230084452, + "loss": 0.78211844, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1487, + "time_per_iteration": 2.8678433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00910546, + "balance_loss_mlp": 1.59136748, + "diversity_loss_mlp": 0.19592074, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.03660624024989628, + "language_loss": 0.86046171, + "learning_rate": 0.000837456802468914, + "loss": 0.86956716, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01690142, + "step": 1488, + "time_per_iteration": 2.602982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_mlp": 1.08787107, + "diversity_loss_mlp": 0.0, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.0820682475712047, + "language_loss": 0.85374725, + "learning_rate": 0.0008372268509170331, + "loss": 0.86477119, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1489, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099554, + "balance_loss_mlp": 1.08529639, + "diversity_loss_mlp": 0.0, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09305985964981825, + "language_loss": 0.85262501, + "learning_rate": 0.0008369967684420779, + "loss": 0.86362052, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1490, + "time_per_iteration": 2.7102949619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.06912422, + "diversity_loss_mlp": 0.0, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.08804420397834639, + "language_loss": 0.84696782, + "learning_rate": 0.0008367665551333736, + "loss": 0.85779965, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1491, + "time_per_iteration": 2.618272304534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_mlp": 1.07430756, + "diversity_loss_mlp": 0.0, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.07991380194683065, + "language_loss": 0.85525382, + "learning_rate": 0.0008365362110802977, + "loss": 0.86614019, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1492, + "time_per_iteration": 2.851928234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_mlp": 1.08655906, + "diversity_loss_mlp": 0.0, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.0838988471662801, + "language_loss": 0.82620168, + "learning_rate": 0.0008363057363722773, + "loss": 0.83721185, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1493, + "time_per_iteration": 2.853207588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.09245062, + "diversity_loss_mlp": 0.0, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.06826703692619526, + "language_loss": 0.84157109, + "learning_rate": 0.0008360751310987906, + "loss": 0.85263485, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1494, + "time_per_iteration": 2.57387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11695361, + "diversity_loss_mlp": 0.0, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.058749130100992836, + "language_loss": 0.85290074, + "learning_rate": 0.0008358443953493666, + "loss": 0.86420786, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1495, + "time_per_iteration": 2.8883073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164777, + "balance_loss_mlp": 1.15067482, + "diversity_loss_mlp": 0.0, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.08087911977453179, + "language_loss": 0.88221979, + "learning_rate": 0.0008356135292135851, + "loss": 0.89386749, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1496, + "time_per_iteration": 2.5230934619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186431, + "balance_loss_mlp": 1.17226899, + "diversity_loss_mlp": 0.0, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.11116302526442519, + "language_loss": 0.92429602, + "learning_rate": 0.0008353825327810758, + "loss": 0.93616039, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1497, + "time_per_iteration": 2.420966863632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188369, + "balance_loss_mlp": 1.17465985, + "diversity_loss_mlp": 0.0, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.07094257684914687, + "language_loss": 0.8160103, + "learning_rate": 0.00083515140614152, + "loss": 0.82789397, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1498, + "time_per_iteration": 2.7105205059051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172297, + "balance_loss_mlp": 1.15901685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.09212284213685974, + "language_loss": 0.87059236, + "learning_rate": 0.0008349201493846485, + "loss": 0.88231528, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1499, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.13470435, + "diversity_loss_mlp": 0.0, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07375807574735407, + "language_loss": 0.88790113, + "learning_rate": 0.0008346887626002432, + "loss": 0.89938325, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1500, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919256, + "balance_loss_mlp": 1.60489607, + "diversity_loss_mlp": 0.19980004, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.030907333217789122, + "language_loss": 0.85892522, + "learning_rate": 0.000834457245878137, + "loss": 0.86811781, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0169074, + "step": 1501, + "time_per_iteration": 2.6543540954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112198, + "balance_loss_mlp": 1.10861671, + "diversity_loss_mlp": 0.0, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.09029230185558035, + "language_loss": 0.81450766, + "learning_rate": 0.000834225599308212, + "loss": 0.82572746, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1502, + "time_per_iteration": 3.2493886947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_mlp": 1.11191428, + "diversity_loss_mlp": 0.0, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07343077704271528, + "language_loss": 0.85592055, + "learning_rate": 0.0008339938229804016, + "loss": 0.86717403, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.13458252, + "routerloss_mlp": 0.0, + "step": 1503, + "time_per_iteration": 2.712455987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.08344853, + "diversity_loss_mlp": 0.0, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.040592353184382625, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76525998, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1504, + "time_per_iteration": 4.975377082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_mlp": 1.10320854, + "diversity_loss_mlp": 0.0, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.10665663300821891, + "language_loss": 0.84014988, + "learning_rate": 0.0008335298814111094, + "loss": 0.85132295, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1505, + "time_per_iteration": 2.563352584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119478, + "balance_loss_mlp": 1.10572124, + "diversity_loss_mlp": 0.0, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.07488877863745698, + "language_loss": 0.87982982, + "learning_rate": 0.0008332977163497455, + "loss": 0.89102459, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1506, + "time_per_iteration": 2.799177646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.10419846, + "diversity_loss_mlp": 0.0, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.08855239932012744, + "language_loss": 0.83522987, + "learning_rate": 0.0008330654218907325, + "loss": 0.84640789, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1507, + "time_per_iteration": 2.7311654090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130891, + "balance_loss_mlp": 1.1170032, + "diversity_loss_mlp": 0.0, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06185767339129184, + "language_loss": 0.82011658, + "learning_rate": 0.0008328329981242548, + "loss": 0.83142549, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1508, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148949, + "balance_loss_mlp": 1.13483465, + "diversity_loss_mlp": 0.0, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.0780337340178098, + "language_loss": 0.88045996, + "learning_rate": 0.0008326004451405475, + "loss": 0.89194947, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1509, + "time_per_iteration": 2.7449288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146827, + "balance_loss_mlp": 1.13290334, + "diversity_loss_mlp": 0.0, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07615169765943663, + "language_loss": 0.82328165, + "learning_rate": 0.0008323677630298957, + "loss": 0.83474988, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1510, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911058, + "balance_loss_mlp": 1.59209251, + "diversity_loss_mlp": 0.19929613, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.030084219280472915, + "language_loss": 0.84789264, + "learning_rate": 0.0008321349518826345, + "loss": 0.85700321, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01536426, + "step": 1511, + "time_per_iteration": 2.85006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167449, + "balance_loss_mlp": 1.15337038, + "diversity_loss_mlp": 0.0, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.09547204503407083, + "language_loss": 0.94614309, + "learning_rate": 0.0008319020117891491, + "loss": 0.95781755, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1512, + "time_per_iteration": 2.619699001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150869, + "balance_loss_mlp": 1.13603973, + "diversity_loss_mlp": 0.0, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0903449194731753, + "language_loss": 0.86757064, + "learning_rate": 0.0008316689428398751, + "loss": 0.87907934, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1513, + "time_per_iteration": 2.6975061893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_mlp": 1.10804975, + "diversity_loss_mlp": 0.0, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05700485295001885, + "language_loss": 0.88661957, + "learning_rate": 0.0008314357451252979, + "loss": 0.89784312, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1514, + "time_per_iteration": 2.7759623527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_mlp": 1.08762062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.06876651723291546, + "language_loss": 0.87979865, + "learning_rate": 0.0008312024187359527, + "loss": 0.89081734, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1515, + "time_per_iteration": 2.6594746112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108928, + "balance_loss_mlp": 1.07499838, + "diversity_loss_mlp": 0.0, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.06943657009436902, + "language_loss": 0.87168229, + "learning_rate": 0.000830968963762425, + "loss": 0.88257504, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1516, + "time_per_iteration": 3.0544168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078645, + "balance_loss_mlp": 1.06457818, + "diversity_loss_mlp": 0.0, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.07942748937188983, + "language_loss": 0.84183443, + "learning_rate": 0.0008307353802953497, + "loss": 0.85262084, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1517, + "time_per_iteration": 2.7325901985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.06031072, + "diversity_loss_mlp": 0.0, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.0903207444065502, + "language_loss": 0.86203992, + "learning_rate": 0.0008305016684254125, + "loss": 0.87279052, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1518, + "time_per_iteration": 2.790580987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_mlp": 1.05908012, + "diversity_loss_mlp": 0.0, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07640210633127195, + "language_loss": 0.86818451, + "learning_rate": 0.0008302678282433479, + "loss": 0.87892002, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1519, + "time_per_iteration": 2.594045400619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077986, + "balance_loss_mlp": 1.06394291, + "diversity_loss_mlp": 0.0, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07607218771192015, + "language_loss": 0.84937745, + "learning_rate": 0.0008300338598399411, + "loss": 0.86015737, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1520, + "time_per_iteration": 2.6176183223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897129, + "balance_loss_mlp": 1.56367016, + "diversity_loss_mlp": 0.19839743, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.03454500929264816, + "language_loss": 0.94754219, + "learning_rate": 0.0008297997633060263, + "loss": 0.95651346, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160955, + "step": 1521, + "time_per_iteration": 2.5507402420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_mlp": 1.08445215, + "diversity_loss_mlp": 0.0, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07923859397995789, + "language_loss": 0.84868819, + "learning_rate": 0.0008295655387324883, + "loss": 0.8596729, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1522, + "time_per_iteration": 2.942894458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_mlp": 1.08957708, + "diversity_loss_mlp": 0.0, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.09185291067452052, + "language_loss": 0.84979212, + "learning_rate": 0.0008293311862102609, + "loss": 0.86082506, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1523, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10218382, + "diversity_loss_mlp": 0.0, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07878242279946136, + "language_loss": 0.88546365, + "learning_rate": 0.0008290967058303275, + "loss": 0.89662319, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1524, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_mlp": 1.10387325, + "diversity_loss_mlp": 0.0, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07157234250277994, + "language_loss": 0.86573815, + "learning_rate": 0.0008288620976837219, + "loss": 0.87690842, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1525, + "time_per_iteration": 2.539079427719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116802, + "balance_loss_mlp": 1.10354626, + "diversity_loss_mlp": 0.0, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07300174969402286, + "language_loss": 0.82548958, + "learning_rate": 0.000828627361861527, + "loss": 0.83665758, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1526, + "time_per_iteration": 2.5784413814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_mlp": 1.10368335, + "diversity_loss_mlp": 0.0, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.105387273671708, + "language_loss": 0.84438479, + "learning_rate": 0.0008283924984548752, + "loss": 0.85555708, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1527, + "time_per_iteration": 2.876854181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136626, + "balance_loss_mlp": 1.12352467, + "diversity_loss_mlp": 0.0, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.07473419184062492, + "language_loss": 0.84776825, + "learning_rate": 0.0008281575075549485, + "loss": 0.8591345, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1528, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_mlp": 1.09631968, + "diversity_loss_mlp": 0.0, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.053938657910520806, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78456688, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1529, + "time_per_iteration": 4.633493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149647, + "balance_loss_mlp": 1.13666511, + "diversity_loss_mlp": 0.0, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.07225715112962865, + "language_loss": 0.90511358, + "learning_rate": 0.0008276871436402469, + "loss": 0.91661, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1530, + "time_per_iteration": 2.8149213790893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156897, + "balance_loss_mlp": 1.14402199, + "diversity_loss_mlp": 0.0, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.10076437192912456, + "language_loss": 0.87526608, + "learning_rate": 0.000827451770808083, + "loss": 0.88683504, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1531, + "time_per_iteration": 2.7307019233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_mlp": 1.12402749, + "diversity_loss_mlp": 0.0, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07118672956881426, + "language_loss": 0.8318634, + "learning_rate": 0.0008272162708478674, + "loss": 0.84323561, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1532, + "time_per_iteration": 2.559326648712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_mlp": 1.1222167, + "diversity_loss_mlp": 0.0, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.07324079883183283, + "language_loss": 0.86170006, + "learning_rate": 0.000826980643851029, + "loss": 0.87305093, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1533, + "time_per_iteration": 2.728351354598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120692, + "balance_loss_mlp": 1.10734081, + "diversity_loss_mlp": 0.0, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.07850912920042735, + "language_loss": 0.84523225, + "learning_rate": 0.0008267448899090464, + "loss": 0.85643911, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1534, + "time_per_iteration": 2.595296859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121931, + "balance_loss_mlp": 1.10788798, + "diversity_loss_mlp": 0.0, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07265790711823701, + "language_loss": 0.80930066, + "learning_rate": 0.0008265090091134473, + "loss": 0.82051992, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1535, + "time_per_iteration": 2.8336315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105358, + "balance_loss_mlp": 1.09133863, + "diversity_loss_mlp": 0.0, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.08467148330579209, + "language_loss": 0.80271345, + "learning_rate": 0.0008262730015558088, + "loss": 0.81376696, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1536, + "time_per_iteration": 2.9066760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_mlp": 1.08847594, + "diversity_loss_mlp": 0.0, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.07407642769484, + "language_loss": 0.81805962, + "learning_rate": 0.0008260368673277574, + "loss": 0.82908159, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1537, + "time_per_iteration": 3.1795482635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_mlp": 1.09302735, + "diversity_loss_mlp": 0.0, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06784415515848828, + "language_loss": 0.84026253, + "learning_rate": 0.0008258006065209682, + "loss": 0.85132986, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1538, + "time_per_iteration": 2.766732931137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112658, + "balance_loss_mlp": 1.09863889, + "diversity_loss_mlp": 0.0, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.0747520981493109, + "language_loss": 0.80543184, + "learning_rate": 0.0008255642192271657, + "loss": 0.81655836, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1539, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130833, + "balance_loss_mlp": 1.11683834, + "diversity_loss_mlp": 0.0, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06277821647748005, + "language_loss": 0.83592129, + "learning_rate": 0.0008253277055381241, + "loss": 0.8472296, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1540, + "time_per_iteration": 2.8384311199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138407, + "balance_loss_mlp": 1.12428069, + "diversity_loss_mlp": 0.0, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09924754491110549, + "language_loss": 0.85482454, + "learning_rate": 0.0008250910655456658, + "loss": 0.86620867, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1541, + "time_per_iteration": 3.1718008518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.12016189, + "diversity_loss_mlp": 0.0, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.07747440640117766, + "language_loss": 0.83370835, + "learning_rate": 0.0008248542993416625, + "loss": 0.84504688, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1542, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127147, + "balance_loss_mlp": 1.11278272, + "diversity_loss_mlp": 0.0, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08018137719350796, + "language_loss": 0.83926904, + "learning_rate": 0.0008246174070180352, + "loss": 0.85054052, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1543, + "time_per_iteration": 2.6775217056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.10168624, + "diversity_loss_mlp": 0.0, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09273281815149376, + "language_loss": 0.83928716, + "learning_rate": 0.0008243803886667537, + "loss": 0.85044312, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1544, + "time_per_iteration": 3.0925238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_mlp": 1.09024858, + "diversity_loss_mlp": 0.0, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.06593992881851045, + "language_loss": 0.79115343, + "learning_rate": 0.0008241432443798364, + "loss": 0.80219567, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1545, + "time_per_iteration": 2.839099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_mlp": 1.07518196, + "diversity_loss_mlp": 0.0, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05453506209022983, + "language_loss": 0.85691601, + "learning_rate": 0.0008239059742493512, + "loss": 0.86780155, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1546, + "time_per_iteration": 2.7476751804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088565, + "balance_loss_mlp": 1.07480812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.06672989003234615, + "language_loss": 0.87117672, + "learning_rate": 0.0008236685783674142, + "loss": 0.88206244, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1547, + "time_per_iteration": 3.0519776344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06796312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.04305360715769565, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.772995, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1548, + "time_per_iteration": 4.883166790008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.07123256, + "diversity_loss_mlp": 0.0, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.11160876507978217, + "language_loss": 0.82253683, + "learning_rate": 0.0008231934097178955, + "loss": 0.8333841, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.1350708, + "routerloss_mlp": 0.0, + "step": 1549, + "time_per_iteration": 2.60786771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.07919788, + "diversity_loss_mlp": 0.0, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07843428838445873, + "language_loss": 0.85328496, + "learning_rate": 0.0008229556371347903, + "loss": 0.86420953, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1550, + "time_per_iteration": 2.962412118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106892, + "balance_loss_mlp": 1.09379029, + "diversity_loss_mlp": 0.0, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.0840525031564576, + "language_loss": 0.79399186, + "learning_rate": 0.0008227177391691874, + "loss": 0.80506086, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.13122559, + "routerloss_mlp": 0.0, + "step": 1551, + "time_per_iteration": 3.1673550605773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111848, + "balance_loss_mlp": 1.09871709, + "diversity_loss_mlp": 0.0, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07195743014481873, + "language_loss": 0.89281148, + "learning_rate": 0.0008224797159134463, + "loss": 0.90392995, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1552, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121508, + "balance_loss_mlp": 1.10890126, + "diversity_loss_mlp": 0.0, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07485820549569244, + "language_loss": 0.83144093, + "learning_rate": 0.0008222415674599765, + "loss": 0.84265602, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 1553, + "time_per_iteration": 3.077017068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135128, + "balance_loss_mlp": 1.12165701, + "diversity_loss_mlp": 0.0, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.08671551895934956, + "language_loss": 0.83149582, + "learning_rate": 0.0008220032939012349, + "loss": 0.84284711, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1554, + "time_per_iteration": 2.6689035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115966, + "balance_loss_mlp": 1.10284674, + "diversity_loss_mlp": 0.0, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06666483036401037, + "language_loss": 0.87800217, + "learning_rate": 0.0008217648953297277, + "loss": 0.88916183, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1555, + "time_per_iteration": 2.8417294025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_mlp": 1.10677278, + "diversity_loss_mlp": 0.0, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.08472740856632217, + "language_loss": 0.78017807, + "learning_rate": 0.0008215263718380095, + "loss": 0.7913779, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1556, + "time_per_iteration": 2.682047128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096383, + "balance_loss_mlp": 1.08319807, + "diversity_loss_mlp": 0.0, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07743195715790333, + "language_loss": 0.84389544, + "learning_rate": 0.0008212877235186833, + "loss": 0.85485923, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.13201904, + "routerloss_mlp": 0.0, + "step": 1557, + "time_per_iteration": 2.6532580852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_mlp": 1.06710196, + "diversity_loss_mlp": 0.0, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04061005434024277, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78811955, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 1558, + "time_per_iteration": 4.923272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092088, + "balance_loss_mlp": 1.07896352, + "diversity_loss_mlp": 0.0, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.10565427097675566, + "language_loss": 0.8116585, + "learning_rate": 0.0008208100527678611, + "loss": 0.82257938, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1559, + "time_per_iteration": 2.602773427963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.07101393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11780548804152448, + "language_loss": 0.78494406, + "learning_rate": 0.0008205710305218135, + "loss": 0.79578459, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1560, + "time_per_iteration": 3.013576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089526, + "balance_loss_mlp": 1.07663918, + "diversity_loss_mlp": 0.0, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.08018423106971302, + "language_loss": 0.89838511, + "learning_rate": 0.0008203318838190541, + "loss": 0.9092803, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1561, + "time_per_iteration": 2.741619348526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108989, + "balance_loss_mlp": 1.07702184, + "diversity_loss_mlp": 0.0, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09397123990600864, + "language_loss": 0.85396177, + "learning_rate": 0.0008200926127524281, + "loss": 0.86486065, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1562, + "time_per_iteration": 2.60974383354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_mlp": 1.0936904, + "diversity_loss_mlp": 0.0, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08688269643752358, + "language_loss": 0.83400619, + "learning_rate": 0.0008198532174148289, + "loss": 0.84507322, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1563, + "time_per_iteration": 2.7336533069610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079297, + "balance_loss_mlp": 1.07195389, + "diversity_loss_mlp": 0.0, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.04112604139988501, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81765467, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1564, + "time_per_iteration": 4.828714609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145426, + "balance_loss_mlp": 1.1324501, + "diversity_loss_mlp": 0.0, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08852118135813189, + "language_loss": 0.89291, + "learning_rate": 0.0008193740542985244, + "loss": 0.90436429, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1565, + "time_per_iteration": 2.5988731384277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151488, + "balance_loss_mlp": 1.13872099, + "diversity_loss_mlp": 0.0, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1281977179548432, + "language_loss": 0.86354733, + "learning_rate": 0.0008191342867058467, + "loss": 0.87506223, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1566, + "time_per_iteration": 2.6914639472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.10574174, + "diversity_loss_mlp": 0.0, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.07018370282969584, + "language_loss": 0.83602738, + "learning_rate": 0.0008188943952142509, + "loss": 0.84721458, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1567, + "time_per_iteration": 2.7846438884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111402, + "balance_loss_mlp": 1.09847367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.08750889372003143, + "language_loss": 0.82150149, + "learning_rate": 0.0008186543799168711, + "loss": 0.83261549, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1568, + "time_per_iteration": 3.1300384998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094152, + "balance_loss_mlp": 1.08103871, + "diversity_loss_mlp": 0.0, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.07719475001811499, + "language_loss": 0.88627326, + "learning_rate": 0.0008184142409068892, + "loss": 0.89721477, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.13134766, + "routerloss_mlp": 0.0, + "step": 1569, + "time_per_iteration": 2.9922726154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_mlp": 1.07475495, + "diversity_loss_mlp": 0.0, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.07345065764158631, + "language_loss": 0.86446834, + "learning_rate": 0.000818173978277536, + "loss": 0.87534571, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.12994385, + "routerloss_mlp": 0.0, + "step": 1570, + "time_per_iteration": 2.695930242538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089564, + "balance_loss_mlp": 1.07673669, + "diversity_loss_mlp": 0.0, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.0712021049255776, + "language_loss": 0.83337176, + "learning_rate": 0.000817933592122089, + "loss": 0.84426749, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.12841797, + "routerloss_mlp": 0.0, + "step": 1571, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.07427394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.08283074842036095, + "language_loss": 0.83667982, + "learning_rate": 0.0008176930825338749, + "loss": 0.84755468, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1572, + "time_per_iteration": 2.5447826385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_mlp": 1.07405734, + "diversity_loss_mlp": 0.0, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07741282152017008, + "language_loss": 0.88849854, + "learning_rate": 0.0008174524496062679, + "loss": 0.89937723, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1573, + "time_per_iteration": 2.908740997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_mlp": 1.07822633, + "diversity_loss_mlp": 0.0, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.06962859876416791, + "language_loss": 0.85499102, + "learning_rate": 0.0008172116934326894, + "loss": 0.86591208, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1574, + "time_per_iteration": 2.751488208770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_mlp": 1.08365786, + "diversity_loss_mlp": 0.0, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.09195920466248479, + "language_loss": 0.8794626, + "learning_rate": 0.0008169708141066097, + "loss": 0.89044309, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1575, + "time_per_iteration": 2.5947275161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118908, + "balance_loss_mlp": 1.10441208, + "diversity_loss_mlp": 0.0, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.0784824693742563, + "language_loss": 0.90658617, + "learning_rate": 0.0008167298117215465, + "loss": 0.91777527, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1576, + "time_per_iteration": 2.5396125316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011316, + "balance_loss_mlp": 1.11705649, + "diversity_loss_mlp": 0.0, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.1093253517132677, + "language_loss": 0.87566864, + "learning_rate": 0.0008164886863710649, + "loss": 0.88698471, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1577, + "time_per_iteration": 2.931835412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138004, + "balance_loss_mlp": 1.12323439, + "diversity_loss_mlp": 0.0, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07788016425512684, + "language_loss": 0.8637675, + "learning_rate": 0.0008162474381487783, + "loss": 0.87514758, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1578, + "time_per_iteration": 3.041262626647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125978, + "balance_loss_mlp": 1.11132693, + "diversity_loss_mlp": 0.0, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.1532642042193693, + "language_loss": 0.84568751, + "learning_rate": 0.0008160060671483475, + "loss": 0.8569473, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1579, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_mlp": 1.0942831, + "diversity_loss_mlp": 0.0, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.10001869607158981, + "language_loss": 0.8342396, + "learning_rate": 0.0008157645734634809, + "loss": 0.84532249, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1580, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151521, + "balance_loss_mlp": 1.14064956, + "diversity_loss_mlp": 0.0, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.06737085519591758, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78048015, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 1581, + "time_per_iteration": 4.946556329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00631723, + "balance_loss_mlp": 1.05820811, + "diversity_loss_mlp": 0.17941347, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.002006006723137456, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.73846221, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01291206, + "step": 1582, + "time_per_iteration": 4.897693395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.08376384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.07529557219412701, + "language_loss": 0.83949858, + "learning_rate": 0.000815039357240067, + "loss": 0.85047406, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1583, + "time_per_iteration": 2.6096932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101837, + "balance_loss_mlp": 1.0882473, + "diversity_loss_mlp": 0.0, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.0740498467066553, + "language_loss": 0.84922493, + "learning_rate": 0.0008147973737554952, + "loss": 0.86024332, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1584, + "time_per_iteration": 2.7863824367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_mlp": 1.09364963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.11669723774220289, + "language_loss": 0.85926318, + "learning_rate": 0.000814555268055744, + "loss": 0.87033093, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1585, + "time_per_iteration": 2.6167564392089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_mlp": 1.1022768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07476018488685929, + "language_loss": 0.87489879, + "learning_rate": 0.0008143130402348073, + "loss": 0.88605773, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1586, + "time_per_iteration": 2.6318202018737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112097, + "balance_loss_mlp": 1.10742807, + "diversity_loss_mlp": 0.0, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.07016471467090964, + "language_loss": 0.79198885, + "learning_rate": 0.0008140706903867265, + "loss": 0.80319858, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1587, + "time_per_iteration": 2.82663893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128991, + "balance_loss_mlp": 1.11541307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09040046070353, + "language_loss": 0.90612531, + "learning_rate": 0.0008138282186055897, + "loss": 0.91741514, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1588, + "time_per_iteration": 2.690561294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_mlp": 1.12872136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.07675542780120453, + "language_loss": 0.82382154, + "learning_rate": 0.0008135856249855331, + "loss": 0.83524311, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1589, + "time_per_iteration": 2.6935813426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115907, + "balance_loss_mlp": 1.14551568, + "diversity_loss_mlp": 0.0, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.07642745969896261, + "language_loss": 0.89603746, + "learning_rate": 0.0008133429096207398, + "loss": 0.90762818, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1590, + "time_per_iteration": 2.7690787315368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_mlp": 1.10534787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.03962763613217991, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76425815, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.08203125, + "routerloss_mlp": 0.0, + "step": 1591, + "time_per_iteration": 4.950432538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_mlp": 1.17060041, + "diversity_loss_mlp": 0.0, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.0624915030883944, + "language_loss": 0.8671608, + "learning_rate": 0.0008128571140339123, + "loss": 0.87900144, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1592, + "time_per_iteration": 2.717022657394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.15618944, + "diversity_loss_mlp": 0.0, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.08640912687422367, + "language_loss": 0.87240267, + "learning_rate": 0.0008126140340004805, + "loss": 0.88410139, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1593, + "time_per_iteration": 2.5112054347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.14379096, + "diversity_loss_mlp": 0.0, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06492228459438584, + "language_loss": 0.82168889, + "learning_rate": 0.0008123708325995172, + "loss": 0.83326268, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1594, + "time_per_iteration": 3.193125009536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139509, + "balance_loss_mlp": 1.1256932, + "diversity_loss_mlp": 0.0, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06515151231920442, + "language_loss": 0.79815221, + "learning_rate": 0.0008121275099254414, + "loss": 0.80954736, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1595, + "time_per_iteration": 2.9032304286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133663, + "balance_loss_mlp": 1.12007284, + "diversity_loss_mlp": 0.0, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06899315915000012, + "language_loss": 0.88638222, + "learning_rate": 0.0008118840660727194, + "loss": 0.89771879, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1596, + "time_per_iteration": 2.6298515796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115466, + "balance_loss_mlp": 1.10215056, + "diversity_loss_mlp": 0.0, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06984166924665287, + "language_loss": 0.87847084, + "learning_rate": 0.0008116405011358644, + "loss": 0.88962543, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.13336182, + "routerloss_mlp": 0.0, + "step": 1597, + "time_per_iteration": 3.1922342777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.08212388, + "diversity_loss_mlp": 0.0, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.07145022695402857, + "language_loss": 0.79985273, + "learning_rate": 0.0008113968152094369, + "loss": 0.81081259, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1598, + "time_per_iteration": 2.500500440597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_mlp": 1.07637632, + "diversity_loss_mlp": 0.0, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.07896733537507578, + "language_loss": 0.82477671, + "learning_rate": 0.0008111530083880438, + "loss": 0.83567768, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1599, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.07693791, + "diversity_loss_mlp": 0.0, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.10700735308097704, + "language_loss": 0.86289096, + "learning_rate": 0.0008109090807663399, + "loss": 0.87379909, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1600, + "time_per_iteration": 2.7883458137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_mlp": 1.07049167, + "diversity_loss_mlp": 0.0, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.058046583591585654, + "language_loss": 0.8845669, + "learning_rate": 0.0008106650324390257, + "loss": 0.89541531, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1601, + "time_per_iteration": 2.8250818252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012306, + "balance_loss_mlp": 1.78856134, + "diversity_loss_mlp": 0.20302816, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.03151963489439222, + "language_loss": 0.81347358, + "learning_rate": 0.0008104208635008493, + "loss": 0.8235966, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165114, + "step": 1602, + "time_per_iteration": 2.6824991703033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078191, + "balance_loss_mlp": 1.06365991, + "diversity_loss_mlp": 0.0, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.06925842581040223, + "language_loss": 0.81696957, + "learning_rate": 0.0008101765740466058, + "loss": 0.82775152, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1603, + "time_per_iteration": 2.4828884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083153, + "balance_loss_mlp": 1.06891942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.08194523431430376, + "language_loss": 0.83996522, + "learning_rate": 0.0008099321641711364, + "loss": 0.85079676, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1604, + "time_per_iteration": 2.628990650177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093302, + "balance_loss_mlp": 1.07891393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.066381842407901, + "language_loss": 0.83568424, + "learning_rate": 0.0008096876339693295, + "loss": 0.84661728, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1605, + "time_per_iteration": 2.621486186981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_mlp": 1.0898906, + "diversity_loss_mlp": 0.0, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08065648415588843, + "language_loss": 0.8146233, + "learning_rate": 0.0008094429835361206, + "loss": 0.82566357, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1606, + "time_per_iteration": 2.9436137676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_mlp": 1.08727765, + "diversity_loss_mlp": 0.0, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.06722603246449312, + "language_loss": 0.85730284, + "learning_rate": 0.0008091982129664908, + "loss": 0.86832106, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1607, + "time_per_iteration": 2.6776270866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_mlp": 1.09606481, + "diversity_loss_mlp": 0.0, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07435522574008574, + "language_loss": 0.83177197, + "learning_rate": 0.0008089533223554687, + "loss": 0.842875, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1608, + "time_per_iteration": 2.6971724033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106883, + "balance_loss_mlp": 1.09322155, + "diversity_loss_mlp": 0.0, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08534881839400792, + "language_loss": 0.85436511, + "learning_rate": 0.0008087083117981294, + "loss": 0.86543399, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1609, + "time_per_iteration": 2.873072624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_mlp": 1.08715367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.08408730625442483, + "language_loss": 0.88209295, + "learning_rate": 0.0008084631813895943, + "loss": 0.89310181, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1610, + "time_per_iteration": 2.7717368602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_mlp": 1.0843389, + "diversity_loss_mlp": 0.0, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07291880748627809, + "language_loss": 0.84093356, + "learning_rate": 0.0008082179312250315, + "loss": 0.85191453, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1611, + "time_per_iteration": 2.6323728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167376, + "balance_loss_mlp": 1.15912676, + "diversity_loss_mlp": 0.0, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.06715325583723679, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81023216, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 1612, + "time_per_iteration": 4.837978839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_mlp": 1.09591889, + "diversity_loss_mlp": 0.0, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.04843806861709949, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77733123, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.07861328, + "routerloss_mlp": 0.0, + "step": 1613, + "time_per_iteration": 5.086154937744141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_mlp": 1.10497594, + "diversity_loss_mlp": 0.0, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.09649046421891638, + "language_loss": 0.82414234, + "learning_rate": 0.0008074814631475545, + "loss": 0.83532858, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1614, + "time_per_iteration": 3.3300058841705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115901, + "balance_loss_mlp": 1.10232294, + "diversity_loss_mlp": 0.0, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.10381126956618623, + "language_loss": 0.7917223, + "learning_rate": 0.0008072357349114907, + "loss": 0.80288124, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1615, + "time_per_iteration": 2.692242383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.1100384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.09811598085954727, + "language_loss": 0.88751173, + "learning_rate": 0.0008069898873959363, + "loss": 0.89874619, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1616, + "time_per_iteration": 2.688138723373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119599, + "balance_loss_mlp": 1.10590243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.06496922585492992, + "language_loss": 0.85670269, + "learning_rate": 0.0008067439206963375, + "loss": 0.8678987, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1617, + "time_per_iteration": 2.628465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126727, + "balance_loss_mlp": 1.11359048, + "diversity_loss_mlp": 0.0, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08367367493581554, + "language_loss": 0.86233091, + "learning_rate": 0.0008064978349081873, + "loss": 0.87359822, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1618, + "time_per_iteration": 2.9359195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_mlp": 1.10941529, + "diversity_loss_mlp": 0.0, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.062058920213391884, + "language_loss": 0.86742592, + "learning_rate": 0.0008062516301270245, + "loss": 0.87865382, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1619, + "time_per_iteration": 2.685615301132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968061, + "balance_loss_mlp": 1.70987701, + "diversity_loss_mlp": 0.19448289, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.02692656797073588, + "language_loss": 0.8831743, + "learning_rate": 0.0008060053064484343, + "loss": 0.89285493, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588114, + "step": 1620, + "time_per_iteration": 2.9507076740264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131577, + "balance_loss_mlp": 1.11839283, + "diversity_loss_mlp": 0.0, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.08216719715750098, + "language_loss": 0.85142976, + "learning_rate": 0.0008057588639680482, + "loss": 0.86274558, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.13208008, + "routerloss_mlp": 0.0, + "step": 1621, + "time_per_iteration": 2.7498936653137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00955916, + "balance_loss_mlp": 1.68915153, + "diversity_loss_mlp": 0.19115068, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.038673577194741904, + "language_loss": 0.82934028, + "learning_rate": 0.0008055123027815434, + "loss": 0.83889943, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01576493, + "step": 1622, + "time_per_iteration": 2.92877459526062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10545552, + "diversity_loss_mlp": 0.0, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.11144773799130939, + "language_loss": 0.8492527, + "learning_rate": 0.0008052656229846436, + "loss": 0.86044282, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.13580322, + "routerloss_mlp": 0.0, + "step": 1623, + "time_per_iteration": 2.6647849082946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_mlp": 1.09039474, + "diversity_loss_mlp": 0.0, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.09067734621983937, + "language_loss": 0.90320027, + "learning_rate": 0.0008050188246731182, + "loss": 0.9142437, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1624, + "time_per_iteration": 2.6908931732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_mlp": 1.07360816, + "diversity_loss_mlp": 0.0, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08706559573327896, + "language_loss": 0.8222695, + "learning_rate": 0.0008047719079427834, + "loss": 0.83314216, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1625, + "time_per_iteration": 2.979578733444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281481, + "balance_loss_mlp": 1.27170551, + "diversity_loss_mlp": 0.0, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.09241126848133228, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75633186, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.09765625, + "routerloss_mlp": 0.0, + "step": 1626, + "time_per_iteration": 4.813723802566528 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_mlp": 1.06489933, + "diversity_loss_mlp": 0.0, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.061158387019755324, + "language_loss": 0.86164916, + "learning_rate": 0.0008042777196091757, + "loss": 0.87243509, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1627, + "time_per_iteration": 2.6777052879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931263, + "balance_loss_mlp": 1.63595629, + "diversity_loss_mlp": 0.19502082, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.02888255305303151, + "language_loss": 0.81839561, + "learning_rate": 0.0008040304481977643, + "loss": 0.82770824, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01577434, + "step": 1628, + "time_per_iteration": 2.685519218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.07024312, + "diversity_loss_mlp": 0.0, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.070875243316129, + "language_loss": 0.86462033, + "learning_rate": 0.0008037830587512649, + "loss": 0.875458, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1629, + "time_per_iteration": 3.0812296867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_mlp": 1.07976675, + "diversity_loss_mlp": 0.0, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.07857424850498267, + "language_loss": 0.78910959, + "learning_rate": 0.0008035355513657224, + "loss": 0.80004621, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1630, + "time_per_iteration": 2.509866714477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109932, + "balance_loss_mlp": 1.08518136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.05926482463995905, + "language_loss": 0.9323386, + "learning_rate": 0.0008032879261372279, + "loss": 0.94333184, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1631, + "time_per_iteration": 2.793675422668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121244, + "balance_loss_mlp": 1.20142555, + "diversity_loss_mlp": 0.0, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.0543299042148954, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80848283, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 1632, + "time_per_iteration": 5.6717705726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08712876, + "diversity_loss_mlp": 0.0, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.07399367926820971, + "language_loss": 0.87236691, + "learning_rate": 0.0008027923225359748, + "loss": 0.88337696, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.13885498, + "routerloss_mlp": 0.0, + "step": 1633, + "time_per_iteration": 2.591161012649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09272563, + "diversity_loss_mlp": 0.0, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07361205381971474, + "language_loss": 0.8823992, + "learning_rate": 0.0008025443443556267, + "loss": 0.89347273, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1634, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.09279966, + "diversity_loss_mlp": 0.0, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.05821338652647348, + "language_loss": 0.88174599, + "learning_rate": 0.000802296248717147, + "loss": 0.89281231, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1635, + "time_per_iteration": 2.924661159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_mlp": 1.08889091, + "diversity_loss_mlp": 0.0, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06918051977022115, + "language_loss": 0.78766519, + "learning_rate": 0.0008020480357168554, + "loss": 0.79869324, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1636, + "time_per_iteration": 2.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_mlp": 1.08334041, + "diversity_loss_mlp": 0.0, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.061070409346790804, + "language_loss": 0.88343245, + "learning_rate": 0.0008017997054511165, + "loss": 0.89440191, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.13623047, + "routerloss_mlp": 0.0, + "step": 1637, + "time_per_iteration": 2.5770463943481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109567, + "balance_loss_mlp": 1.08241367, + "diversity_loss_mlp": 0.0, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06082888573267997, + "language_loss": 0.85688329, + "learning_rate": 0.0008015512580163407, + "loss": 0.86783999, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1638, + "time_per_iteration": 2.7893900871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915347, + "balance_loss_mlp": 1.6005652, + "diversity_loss_mlp": 0.19760543, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.03200753828687725, + "language_loss": 0.80247211, + "learning_rate": 0.0008013026935089838, + "loss": 0.8116256, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0162621, + "step": 1639, + "time_per_iteration": 2.9013028144836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116887, + "balance_loss_mlp": 1.10366678, + "diversity_loss_mlp": 0.0, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.07107229367788748, + "language_loss": 0.84156835, + "learning_rate": 0.0008010540120255472, + "loss": 0.85273731, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1640, + "time_per_iteration": 2.6617894172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122905, + "balance_loss_mlp": 1.10991144, + "diversity_loss_mlp": 0.0, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.08316081918757003, + "language_loss": 0.86058956, + "learning_rate": 0.0008008052136625774, + "loss": 0.87181866, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1641, + "time_per_iteration": 2.8128581047058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.10461712, + "diversity_loss_mlp": 0.0, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.11340060957388516, + "language_loss": 0.86898887, + "learning_rate": 0.0008005562985166666, + "loss": 0.88016647, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1642, + "time_per_iteration": 2.6915791034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113412, + "balance_loss_mlp": 1.10045385, + "diversity_loss_mlp": 0.0, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.06371803301806024, + "language_loss": 0.85065734, + "learning_rate": 0.0008003072666844524, + "loss": 0.86179143, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.12976074, + "routerloss_mlp": 0.0, + "step": 1643, + "time_per_iteration": 2.713515520095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_mlp": 1.09287417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.09207812275617455, + "language_loss": 0.82446098, + "learning_rate": 0.0008000581182626173, + "loss": 0.83551639, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 1644, + "time_per_iteration": 2.5728507041931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099382, + "balance_loss_mlp": 1.08668065, + "diversity_loss_mlp": 0.0, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.07446065392993936, + "language_loss": 0.86341298, + "learning_rate": 0.0007998088533478894, + "loss": 0.87440687, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 1645, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_mlp": 1.09096265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.09512310951915111, + "language_loss": 0.84171218, + "learning_rate": 0.000799559472037042, + "loss": 0.85274899, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 1646, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089286, + "balance_loss_mlp": 1.07678151, + "diversity_loss_mlp": 0.0, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05690135295492242, + "language_loss": 0.87462902, + "learning_rate": 0.0007993099744268932, + "loss": 0.88552189, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1647, + "time_per_iteration": 2.9204719066619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_mlp": 1.08491409, + "diversity_loss_mlp": 0.0, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08028992569563033, + "language_loss": 0.88103539, + "learning_rate": 0.000799060360614307, + "loss": 0.8920151, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1648, + "time_per_iteration": 2.7098584175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094994, + "balance_loss_mlp": 1.08204746, + "diversity_loss_mlp": 0.0, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07374581447427947, + "language_loss": 0.83565277, + "learning_rate": 0.0007988106306961917, + "loss": 0.84660268, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1649, + "time_per_iteration": 3.136148691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096074, + "balance_loss_mlp": 1.08292556, + "diversity_loss_mlp": 0.0, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.08307651310008923, + "language_loss": 0.84510154, + "learning_rate": 0.0007985607847695014, + "loss": 0.85606229, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1650, + "time_per_iteration": 2.6657865047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090136, + "balance_loss_mlp": 1.07697558, + "diversity_loss_mlp": 0.0, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.07221907468491222, + "language_loss": 0.82981718, + "learning_rate": 0.0007983108229312345, + "loss": 0.84071863, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.13183594, + "routerloss_mlp": 0.0, + "step": 1651, + "time_per_iteration": 2.939943313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_mlp": 1.07648206, + "diversity_loss_mlp": 0.0, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.0785368607999539, + "language_loss": 0.86505926, + "learning_rate": 0.0007980607452784351, + "loss": 0.87595987, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1652, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.06952596, + "diversity_loss_mlp": 0.0, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.06920593361186494, + "language_loss": 0.90510356, + "learning_rate": 0.0007978105519081919, + "loss": 0.91593033, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1653, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084984, + "balance_loss_mlp": 1.0715965, + "diversity_loss_mlp": 0.0, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.07269169213621761, + "language_loss": 0.87967515, + "learning_rate": 0.0007975602429176385, + "loss": 0.89052504, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.13415527, + "routerloss_mlp": 0.0, + "step": 1654, + "time_per_iteration": 2.5818393230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_mlp": 1.07225442, + "diversity_loss_mlp": 0.0, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08150423110047789, + "language_loss": 0.81308222, + "learning_rate": 0.0007973098184039536, + "loss": 0.82394195, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1655, + "time_per_iteration": 2.664916515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094509, + "balance_loss_mlp": 1.08110952, + "diversity_loss_mlp": 0.0, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.0661968945841423, + "language_loss": 0.8695243, + "learning_rate": 0.0007970592784643602, + "loss": 0.88046944, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.13427734, + "routerloss_mlp": 0.0, + "step": 1656, + "time_per_iteration": 2.851214647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_mlp": 1.09084868, + "diversity_loss_mlp": 0.0, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.0809768283097012, + "language_loss": 0.85228848, + "learning_rate": 0.0007968086231961272, + "loss": 0.86333275, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1657, + "time_per_iteration": 2.6277201175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_mlp": 1.09744644, + "diversity_loss_mlp": 0.0, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.10999441213252201, + "language_loss": 0.83322126, + "learning_rate": 0.0007965578526965671, + "loss": 0.84433806, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1658, + "time_per_iteration": 2.5514447689056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097556, + "balance_loss_mlp": 1.08337009, + "diversity_loss_mlp": 0.0, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07090711515760839, + "language_loss": 0.86299932, + "learning_rate": 0.0007963069670630377, + "loss": 0.87397492, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1659, + "time_per_iteration": 2.722572088241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.07523549, + "diversity_loss_mlp": 0.0, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07181055202596492, + "language_loss": 0.88127738, + "learning_rate": 0.0007960559663929416, + "loss": 0.8921715, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1660, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_mlp": 1.06500006, + "diversity_loss_mlp": 0.0, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.06614466369263741, + "language_loss": 0.87915826, + "learning_rate": 0.0007958048507837259, + "loss": 0.88995141, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1661, + "time_per_iteration": 2.954888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06107187, + "diversity_loss_mlp": 0.0, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08599761261652404, + "language_loss": 0.87309289, + "learning_rate": 0.0007955536203328822, + "loss": 0.88384914, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1662, + "time_per_iteration": 2.9499282836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074811, + "balance_loss_mlp": 1.06073272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.08962386225204486, + "language_loss": 0.8334958, + "learning_rate": 0.0007953022751379469, + "loss": 0.84424388, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1663, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075722, + "balance_loss_mlp": 1.06131005, + "diversity_loss_mlp": 0.0, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.08182948291647181, + "language_loss": 0.8200748, + "learning_rate": 0.000795050815296501, + "loss": 0.830832, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1664, + "time_per_iteration": 2.9893014430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.07167196, + "diversity_loss_mlp": 0.0, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.0641722272838546, + "language_loss": 0.93037909, + "learning_rate": 0.0007947992409061695, + "loss": 0.94122881, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1665, + "time_per_iteration": 2.583789110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_mlp": 1.08662808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07388769827525307, + "language_loss": 0.86501724, + "learning_rate": 0.0007945475520646226, + "loss": 0.87601787, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1666, + "time_per_iteration": 2.944988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127031, + "balance_loss_mlp": 1.11408508, + "diversity_loss_mlp": 0.0, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.0781321549049884, + "language_loss": 0.84777099, + "learning_rate": 0.0007942957488695743, + "loss": 0.85904133, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1667, + "time_per_iteration": 2.667464017868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.12505507, + "diversity_loss_mlp": 0.0, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06588913292879497, + "language_loss": 0.81000018, + "learning_rate": 0.0007940438314187833, + "loss": 0.82138324, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.13250732, + "routerloss_mlp": 0.0, + "step": 1668, + "time_per_iteration": 3.0395359992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147791, + "balance_loss_mlp": 1.13491094, + "diversity_loss_mlp": 0.0, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.07621602089938284, + "language_loss": 0.80540276, + "learning_rate": 0.0007937917998100529, + "loss": 0.8168807, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1669, + "time_per_iteration": 2.5894687175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_mlp": 1.1294744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07981389159152626, + "language_loss": 0.79167509, + "learning_rate": 0.0007935396541412302, + "loss": 0.80310035, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.13067627, + "routerloss_mlp": 0.0, + "step": 1670, + "time_per_iteration": 2.672978401184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141245, + "balance_loss_mlp": 1.12813175, + "diversity_loss_mlp": 0.0, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.06899314705075654, + "language_loss": 0.85712755, + "learning_rate": 0.0007932873945102068, + "loss": 0.86854005, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1671, + "time_per_iteration": 2.6296515464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_mlp": 1.25616145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.05047573422440889, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.77033865, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1672, + "time_per_iteration": 4.840561628341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138556, + "balance_loss_mlp": 1.1251744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.06902528499394482, + "language_loss": 0.86527705, + "learning_rate": 0.0007927825337533461, + "loss": 0.87666261, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1673, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142697, + "balance_loss_mlp": 1.12930942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.08521571565711833, + "language_loss": 0.84877092, + "learning_rate": 0.0007925299328235131, + "loss": 0.8601979, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1674, + "time_per_iteration": 2.659621238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141943, + "balance_loss_mlp": 1.12855613, + "diversity_loss_mlp": 0.0, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.08187135533898351, + "language_loss": 0.84720862, + "learning_rate": 0.000792277218323488, + "loss": 0.85862803, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1675, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.12169456, + "diversity_loss_mlp": 0.0, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.08499328402904442, + "language_loss": 0.8509531, + "learning_rate": 0.0007920243903513833, + "loss": 0.86230332, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1676, + "time_per_iteration": 2.5730555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126699, + "balance_loss_mlp": 1.11364567, + "diversity_loss_mlp": 0.0, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08854342537284099, + "language_loss": 0.84008271, + "learning_rate": 0.0007917714490053556, + "loss": 0.85134971, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1677, + "time_per_iteration": 2.718555212020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_mlp": 1.10974979, + "diversity_loss_mlp": 0.0, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.07711595043056121, + "language_loss": 0.86223996, + "learning_rate": 0.0007915183943836055, + "loss": 0.87346947, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1678, + "time_per_iteration": 2.902038812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_mlp": 1.09958673, + "diversity_loss_mlp": 0.0, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07762427611918464, + "language_loss": 0.8422336, + "learning_rate": 0.0007912652265843773, + "loss": 0.85335761, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1679, + "time_per_iteration": 3.024665117263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107958, + "balance_loss_mlp": 1.09453535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06959311244041297, + "language_loss": 0.81845474, + "learning_rate": 0.0007910119457059597, + "loss": 0.82953429, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1680, + "time_per_iteration": 2.6954221725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111109, + "balance_loss_mlp": 1.09806776, + "diversity_loss_mlp": 0.0, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08135634404485692, + "language_loss": 0.80380678, + "learning_rate": 0.0007907585518466849, + "loss": 0.81491786, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1681, + "time_per_iteration": 2.961648464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_mlp": 1.09574652, + "diversity_loss_mlp": 0.0, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.06462126830885603, + "language_loss": 0.89670283, + "learning_rate": 0.000790505045104929, + "loss": 0.90779042, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1682, + "time_per_iteration": 2.5210485458374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_mlp": 1.09719789, + "diversity_loss_mlp": 0.0, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.08715930327910015, + "language_loss": 0.86719161, + "learning_rate": 0.0007902514255791125, + "loss": 0.8782934, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1683, + "time_per_iteration": 2.8002610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097901, + "balance_loss_mlp": 1.084764, + "diversity_loss_mlp": 0.0, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06656486310868524, + "language_loss": 0.8795855, + "learning_rate": 0.0007899976933676986, + "loss": 0.89056444, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1684, + "time_per_iteration": 2.967172622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092249, + "balance_loss_mlp": 1.07880259, + "diversity_loss_mlp": 0.0, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09628316614228749, + "language_loss": 0.87045735, + "learning_rate": 0.0007897438485691955, + "loss": 0.88137984, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1685, + "time_per_iteration": 2.680147171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103099, + "balance_loss_mlp": 1.0898304, + "diversity_loss_mlp": 0.0, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0850736326825917, + "language_loss": 0.82684374, + "learning_rate": 0.0007894898912821542, + "loss": 0.83787471, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1686, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.0880518, + "diversity_loss_mlp": 0.0, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06056792299191916, + "language_loss": 0.86695451, + "learning_rate": 0.0007892358216051695, + "loss": 0.87797034, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1687, + "time_per_iteration": 2.7851648330688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.09641767, + "diversity_loss_mlp": 0.0, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.07434076211008771, + "language_loss": 0.91829026, + "learning_rate": 0.0007889816396368803, + "loss": 0.92938912, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1688, + "time_per_iteration": 2.6211581230163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.10499799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07845440141588131, + "language_loss": 0.85253429, + "learning_rate": 0.0007887273454759687, + "loss": 0.8637172, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.13299561, + "routerloss_mlp": 0.0, + "step": 1689, + "time_per_iteration": 2.507779598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.10946417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.08373410695529686, + "language_loss": 0.82792354, + "learning_rate": 0.0007884729392211603, + "loss": 0.83914578, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1690, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119249, + "balance_loss_mlp": 1.10672641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09069843341009556, + "language_loss": 0.85648167, + "learning_rate": 0.0007882184209712245, + "loss": 0.86767411, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.12530518, + "routerloss_mlp": 0.0, + "step": 1691, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00949982, + "balance_loss_mlp": 1.66309059, + "diversity_loss_mlp": 0.20491584, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.028395749586794427, + "language_loss": 0.85757548, + "learning_rate": 0.000787963790824974, + "loss": 0.86707526, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597837, + "step": 1692, + "time_per_iteration": 3.009209156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113225, + "balance_loss_mlp": 1.10071397, + "diversity_loss_mlp": 0.0, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.22846677162281695, + "language_loss": 0.89612615, + "learning_rate": 0.0007877090488812651, + "loss": 0.90725839, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.12512207, + "routerloss_mlp": 0.0, + "step": 1693, + "time_per_iteration": 2.450209617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936753, + "balance_loss_mlp": 1.63723278, + "diversity_loss_mlp": 0.20419246, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.03161007726798549, + "language_loss": 0.83743423, + "learning_rate": 0.0007874541952389973, + "loss": 0.84680176, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01604037, + "step": 1694, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111615, + "balance_loss_mlp": 1.10350823, + "diversity_loss_mlp": 0.0, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.07424213060006848, + "language_loss": 0.86538494, + "learning_rate": 0.0007871992299971136, + "loss": 0.87654638, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1695, + "time_per_iteration": 2.570406913757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_mlp": 1.11953878, + "diversity_loss_mlp": 0.0, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.0612219868328418, + "language_loss": 0.84142137, + "learning_rate": 0.0007869441532546001, + "loss": 0.852741, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1696, + "time_per_iteration": 2.763688087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_mlp": 1.11626601, + "diversity_loss_mlp": 0.0, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06155756648422996, + "language_loss": 0.79298395, + "learning_rate": 0.0007866889651104867, + "loss": 0.80426925, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 1697, + "time_per_iteration": 2.816236972808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_mlp": 1.11769366, + "diversity_loss_mlp": 0.0, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0827611554210385, + "language_loss": 0.83172429, + "learning_rate": 0.000786433665663846, + "loss": 0.84303296, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.13195801, + "routerloss_mlp": 0.0, + "step": 1698, + "time_per_iteration": 2.6627049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135283, + "balance_loss_mlp": 1.12240815, + "diversity_loss_mlp": 0.0, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.08562611300573084, + "language_loss": 0.86256903, + "learning_rate": 0.0007861782550137942, + "loss": 0.87392187, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1699, + "time_per_iteration": 2.9298973083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115677, + "balance_loss_mlp": 1.10270739, + "diversity_loss_mlp": 0.0, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.06870341741306431, + "language_loss": 0.85913056, + "learning_rate": 0.0007859227332594901, + "loss": 0.8702873, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1700, + "time_per_iteration": 2.9108214378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_mlp": 1.08703494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.08010897822069696, + "language_loss": 0.84705722, + "learning_rate": 0.0007856671005001365, + "loss": 0.85805643, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1701, + "time_per_iteration": 3.172921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_mlp": 1.07506084, + "diversity_loss_mlp": 0.0, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.0963591610521261, + "language_loss": 0.81720912, + "learning_rate": 0.0007854113568349787, + "loss": 0.82809043, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.13085938, + "routerloss_mlp": 0.0, + "step": 1702, + "time_per_iteration": 3.1135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_mlp": 1.08686948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07838750037803571, + "language_loss": 0.80661154, + "learning_rate": 0.0007851555023633052, + "loss": 0.8176142, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.13397217, + "routerloss_mlp": 0.0, + "step": 1703, + "time_per_iteration": 2.841059684753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_mlp": 1.07271171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07047077484334266, + "language_loss": 0.82222247, + "learning_rate": 0.0007848995371844474, + "loss": 0.83308667, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1704, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.0816896, + "diversity_loss_mlp": 0.0, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08203255389116743, + "language_loss": 0.80260348, + "learning_rate": 0.0007846434613977801, + "loss": 0.81355333, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1705, + "time_per_iteration": 2.523026466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.0868392, + "diversity_loss_mlp": 0.0, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.07270926258732689, + "language_loss": 0.78603041, + "learning_rate": 0.0007843872751027203, + "loss": 0.7970314, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.13275146, + "routerloss_mlp": 0.0, + "step": 1706, + "time_per_iteration": 2.8923709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915397, + "balance_loss_mlp": 1.59612775, + "diversity_loss_mlp": 0.20258766, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.02966318853366187, + "language_loss": 0.87305748, + "learning_rate": 0.0007841309783987287, + "loss": 0.88221151, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01603885, + "step": 1707, + "time_per_iteration": 2.7517144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115655, + "balance_loss_mlp": 1.10263109, + "diversity_loss_mlp": 0.0, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06500174516261728, + "language_loss": 0.89240694, + "learning_rate": 0.0007838745713853084, + "loss": 0.9035635, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1708, + "time_per_iteration": 2.6181201934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_mlp": 1.10945296, + "diversity_loss_mlp": 0.0, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.06936064314807153, + "language_loss": 0.8434307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85465395, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.12866211, + "routerloss_mlp": 0.0, + "step": 1709, + "time_per_iteration": 2.7040350437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124287, + "balance_loss_mlp": 1.1112572, + "diversity_loss_mlp": 0.0, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06883588356672955, + "language_loss": 0.86454904, + "learning_rate": 0.0007833614268284082, + "loss": 0.87579191, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 1710, + "time_per_iteration": 2.5110740661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425821, + "balance_loss_mlp": 1.41738081, + "diversity_loss_mlp": 0.0, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.1402114647579648, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75535595, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.08447266, + "routerloss_mlp": 0.0, + "step": 1711, + "time_per_iteration": 4.873327016830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129416, + "balance_loss_mlp": 1.11650598, + "diversity_loss_mlp": 0.0, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0798208466882041, + "language_loss": 0.78414649, + "learning_rate": 0.0007828478422289016, + "loss": 0.79544067, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 1712, + "time_per_iteration": 2.608412027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138403, + "balance_loss_mlp": 1.12507582, + "diversity_loss_mlp": 0.0, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07544776571140048, + "language_loss": 0.8909815, + "learning_rate": 0.0007825908851623833, + "loss": 0.90236557, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.13323975, + "routerloss_mlp": 0.0, + "step": 1713, + "time_per_iteration": 2.8033607006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_mlp": 1.12190771, + "diversity_loss_mlp": 0.0, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.06974595077498419, + "language_loss": 0.85003847, + "learning_rate": 0.0007823338183843533, + "loss": 0.86138809, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1714, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148942, + "balance_loss_mlp": 1.13610959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.07049806127627434, + "language_loss": 0.81025606, + "learning_rate": 0.0007820766419946141, + "loss": 0.82174551, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1715, + "time_per_iteration": 3.3007164001464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168148, + "balance_loss_mlp": 1.16008925, + "diversity_loss_mlp": 0.0, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.052131774928428895, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80840629, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1716, + "time_per_iteration": 4.947760105133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906852, + "balance_loss_mlp": 1.58163857, + "diversity_loss_mlp": 0.20079982, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.033697214377685164, + "language_loss": 0.75853068, + "learning_rate": 0.0007815619607794288, + "loss": 0.76759923, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563331, + "step": 1717, + "time_per_iteration": 2.689937114715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173062, + "balance_loss_mlp": 1.1601274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.09689448967864323, + "language_loss": 0.8294118, + "learning_rate": 0.0007813044561538001, + "loss": 0.84114236, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1718, + "time_per_iteration": 3.1421005725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158875, + "balance_loss_mlp": 1.14559531, + "diversity_loss_mlp": 0.0, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06842928932014077, + "language_loss": 0.88578129, + "learning_rate": 0.0007810468423160958, + "loss": 0.89736998, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1719, + "time_per_iteration": 2.8917293548583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157511, + "balance_loss_mlp": 1.14486265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06941390463820386, + "language_loss": 0.81896281, + "learning_rate": 0.0007807891193663306, + "loss": 0.83053792, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 1720, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141979, + "balance_loss_mlp": 1.12950385, + "diversity_loss_mlp": 0.0, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07961809028947962, + "language_loss": 0.82409328, + "learning_rate": 0.0007805312874045614, + "loss": 0.83551311, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1721, + "time_per_iteration": 2.5056259632110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137994, + "balance_loss_mlp": 1.12510777, + "diversity_loss_mlp": 0.0, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.09061115976682882, + "language_loss": 0.86960506, + "learning_rate": 0.0007802733465308874, + "loss": 0.88098502, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1722, + "time_per_iteration": 2.438533306121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_mlp": 1.13225603, + "diversity_loss_mlp": 0.0, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06773749819611302, + "language_loss": 0.84162688, + "learning_rate": 0.0007800152968454501, + "loss": 0.8530758, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1723, + "time_per_iteration": 2.6364991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.12146711, + "diversity_loss_mlp": 0.0, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06044198445597461, + "language_loss": 0.90330362, + "learning_rate": 0.0007797571384484334, + "loss": 0.91464406, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.12567139, + "routerloss_mlp": 0.0, + "step": 1724, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133346, + "balance_loss_mlp": 1.12061453, + "diversity_loss_mlp": 0.0, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.0752969909322094, + "language_loss": 0.91929704, + "learning_rate": 0.0007794988714400633, + "loss": 0.93063056, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 1725, + "time_per_iteration": 2.615788698196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125798, + "balance_loss_mlp": 1.11242867, + "diversity_loss_mlp": 0.0, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.07890733478173245, + "language_loss": 0.85302055, + "learning_rate": 0.0007792404959206079, + "loss": 0.86427855, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.13372803, + "routerloss_mlp": 0.0, + "step": 1726, + "time_per_iteration": 2.545780897140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107165, + "balance_loss_mlp": 1.09446895, + "diversity_loss_mlp": 0.0, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07756389475354548, + "language_loss": 0.81480336, + "learning_rate": 0.0007789820119903774, + "loss": 0.82587504, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.12689209, + "routerloss_mlp": 0.0, + "step": 1727, + "time_per_iteration": 3.005662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114992, + "balance_loss_mlp": 1.10335684, + "diversity_loss_mlp": 0.0, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.03748312413261812, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.7960766, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 1728, + "time_per_iteration": 4.833205223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105872, + "balance_loss_mlp": 1.09285486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.07170574552345628, + "language_loss": 0.83970881, + "learning_rate": 0.0007784647192990428, + "loss": 0.85076749, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 1729, + "time_per_iteration": 2.7309772968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107979, + "balance_loss_mlp": 1.0948776, + "diversity_loss_mlp": 0.0, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06011930461286596, + "language_loss": 0.80777055, + "learning_rate": 0.0007782059107387696, + "loss": 0.81885028, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.13116455, + "routerloss_mlp": 0.0, + "step": 1730, + "time_per_iteration": 2.8615641593933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113195, + "balance_loss_mlp": 1.11733532, + "diversity_loss_mlp": 0.0, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08106060743083753, + "language_loss": 0.88617826, + "learning_rate": 0.0007779469941693826, + "loss": 0.89749771, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1731, + "time_per_iteration": 2.801208257675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126142, + "balance_loss_mlp": 1.11240935, + "diversity_loss_mlp": 0.0, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.09519717038034853, + "language_loss": 0.77091044, + "learning_rate": 0.0007776879696914029, + "loss": 0.78217185, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1732, + "time_per_iteration": 2.8286595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123068, + "balance_loss_mlp": 1.10889435, + "diversity_loss_mlp": 0.0, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.05947539267688924, + "language_loss": 0.88910627, + "learning_rate": 0.000777428837405392, + "loss": 0.90033698, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1733, + "time_per_iteration": 2.8319156169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121491, + "balance_loss_mlp": 1.10701954, + "diversity_loss_mlp": 0.0, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.07113995025739508, + "language_loss": 0.86735553, + "learning_rate": 0.0007771695974119544, + "loss": 0.87857044, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1734, + "time_per_iteration": 2.5376570224761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_mlp": 1.09795249, + "diversity_loss_mlp": 0.0, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.08734149249458338, + "language_loss": 0.75937277, + "learning_rate": 0.0007769102498117359, + "loss": 0.77049315, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1735, + "time_per_iteration": 3.093188524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105698, + "balance_loss_mlp": 1.09138131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06929562674350419, + "language_loss": 0.79383999, + "learning_rate": 0.000776650794705424, + "loss": 0.80489695, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1736, + "time_per_iteration": 3.253673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_mlp": 1.10730791, + "diversity_loss_mlp": 0.0, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.06325878214231093, + "language_loss": 0.82130396, + "learning_rate": 0.0007763912321937483, + "loss": 0.83252084, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1737, + "time_per_iteration": 2.7109947204589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117751, + "balance_loss_mlp": 1.10324299, + "diversity_loss_mlp": 0.0, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.08404595709863052, + "language_loss": 0.82403475, + "learning_rate": 0.0007761315623774799, + "loss": 0.83521223, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1738, + "time_per_iteration": 3.4125657081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109047, + "balance_loss_mlp": 1.0946703, + "diversity_loss_mlp": 0.0, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08421865543081901, + "language_loss": 0.87820536, + "learning_rate": 0.0007758717853574313, + "loss": 0.88929582, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1739, + "time_per_iteration": 2.7345223426818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106099, + "balance_loss_mlp": 1.09184134, + "diversity_loss_mlp": 0.0, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.07638673743764693, + "language_loss": 0.90095574, + "learning_rate": 0.0007756119012344571, + "loss": 0.91201669, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1740, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.08709717, + "diversity_loss_mlp": 0.0, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06863708242027233, + "language_loss": 0.8461023, + "learning_rate": 0.0007753519101094535, + "loss": 0.85711253, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1741, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089984, + "balance_loss_mlp": 1.07595301, + "diversity_loss_mlp": 0.0, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.07992644583812669, + "language_loss": 0.86363387, + "learning_rate": 0.0007750918120833575, + "loss": 0.87453371, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1742, + "time_per_iteration": 2.58940052986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.07488728, + "diversity_loss_mlp": 0.0, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.11201991585260462, + "language_loss": 0.87392128, + "learning_rate": 0.0007748316072571485, + "loss": 0.88480592, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1743, + "time_per_iteration": 2.8557286262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_mlp": 1.07202053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0749416267225997, + "language_loss": 0.79045737, + "learning_rate": 0.0007745712957318467, + "loss": 0.80131996, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1744, + "time_per_iteration": 2.9912548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_mlp": 1.07057166, + "diversity_loss_mlp": 0.0, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06946859722884112, + "language_loss": 0.86471289, + "learning_rate": 0.0007743108776085141, + "loss": 0.87555522, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1745, + "time_per_iteration": 2.7899224758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_mlp": 1.07023191, + "diversity_loss_mlp": 0.0, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08256839233284315, + "language_loss": 0.82965624, + "learning_rate": 0.0007740503529882543, + "loss": 0.84050083, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1746, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_mlp": 1.07044971, + "diversity_loss_mlp": 0.0, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.07349682427851349, + "language_loss": 0.90707254, + "learning_rate": 0.0007737897219722114, + "loss": 0.91791821, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1747, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_mlp": 1.07794499, + "diversity_loss_mlp": 0.0, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.05794758251669461, + "language_loss": 0.81094921, + "learning_rate": 0.0007735289846615716, + "loss": 0.82187206, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1748, + "time_per_iteration": 2.677976369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108166, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.0827866783592608, + "language_loss": 0.823035, + "learning_rate": 0.0007732681411575621, + "loss": 0.8341167, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1749, + "time_per_iteration": 2.674349069595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114062, + "balance_loss_mlp": 1.09997165, + "diversity_loss_mlp": 0.0, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.4203922337067485, + "language_loss": 0.87328398, + "learning_rate": 0.0007730071915614514, + "loss": 0.88442457, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1750, + "time_per_iteration": 2.6714634895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113648, + "balance_loss_mlp": 1.10037947, + "diversity_loss_mlp": 0.0, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09571011442330926, + "language_loss": 0.88792437, + "learning_rate": 0.0007727461359745489, + "loss": 0.89906085, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1751, + "time_per_iteration": 2.469905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141755, + "balance_loss_mlp": 1.12897623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.07412184794878955, + "language_loss": 0.85941112, + "learning_rate": 0.0007724849744982056, + "loss": 0.87082875, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 1752, + "time_per_iteration": 2.6805977821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117715, + "balance_loss_mlp": 1.16388226, + "diversity_loss_mlp": 0.0, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.09378397224837084, + "language_loss": 0.81843758, + "learning_rate": 0.0007722237072338131, + "loss": 0.83020908, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1753, + "time_per_iteration": 2.7348344326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186311, + "balance_loss_mlp": 1.17280459, + "diversity_loss_mlp": 0.0, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.1034159122014491, + "language_loss": 0.85304463, + "learning_rate": 0.0007719623342828046, + "loss": 0.86490774, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1754, + "time_per_iteration": 2.5181336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202577, + "balance_loss_mlp": 1.18872511, + "diversity_loss_mlp": 0.0, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.12703041648808322, + "language_loss": 0.84088987, + "learning_rate": 0.000771700855746654, + "loss": 0.85291564, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1755, + "time_per_iteration": 2.590925931930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188345, + "balance_loss_mlp": 1.1743381, + "diversity_loss_mlp": 0.0, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06849832931784437, + "language_loss": 0.88371092, + "learning_rate": 0.0007714392717268763, + "loss": 0.89559436, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1756, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_mlp": 1.17545295, + "diversity_loss_mlp": 0.0, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.09135673410225151, + "language_loss": 0.8630141, + "learning_rate": 0.0007711775823250273, + "loss": 0.8749072, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1757, + "time_per_iteration": 2.562939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194838, + "balance_loss_mlp": 1.18069935, + "diversity_loss_mlp": 0.0, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.07414503329772545, + "language_loss": 0.83081156, + "learning_rate": 0.0007709157876427039, + "loss": 0.84275991, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1758, + "time_per_iteration": 3.0652947425842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190916, + "balance_loss_mlp": 1.17681408, + "diversity_loss_mlp": 0.0, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.06977999371164574, + "language_loss": 0.85321373, + "learning_rate": 0.0007706538877815439, + "loss": 0.86512285, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1759, + "time_per_iteration": 2.5949320793151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202515, + "balance_loss_mlp": 1.1888063, + "diversity_loss_mlp": 0.0, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.052908737395413206, + "language_loss": 0.83029473, + "learning_rate": 0.0007703918828432259, + "loss": 0.84231991, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1760, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231589, + "balance_loss_mlp": 1.21696198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.11529749255982873, + "language_loss": 0.89274669, + "learning_rate": 0.000770129772929469, + "loss": 0.90506256, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1761, + "time_per_iteration": 2.6486427783966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212596, + "balance_loss_mlp": 1.19812357, + "diversity_loss_mlp": 0.0, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.10010821715075297, + "language_loss": 0.8820551, + "learning_rate": 0.0007698675581420334, + "loss": 0.89418107, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1762, + "time_per_iteration": 2.8473589420318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_mlp": 1.15610099, + "diversity_loss_mlp": 0.0, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06768336788468338, + "language_loss": 0.79040444, + "learning_rate": 0.0007696052385827199, + "loss": 0.80210984, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1763, + "time_per_iteration": 2.9893951416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147034, + "balance_loss_mlp": 1.13271689, + "diversity_loss_mlp": 0.0, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.06731413775333611, + "language_loss": 0.78161937, + "learning_rate": 0.00076934281435337, + "loss": 0.79308975, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1764, + "time_per_iteration": 2.7329161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933074, + "balance_loss_mlp": 1.62411106, + "diversity_loss_mlp": 0.20785357, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0341650984642099, + "language_loss": 0.86205357, + "learning_rate": 0.0007690802855558658, + "loss": 0.87138426, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170921, + "step": 1765, + "time_per_iteration": 2.9281163215637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121638, + "balance_loss_mlp": 1.10924029, + "diversity_loss_mlp": 0.0, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.029090002598214117, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77496594, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 1766, + "time_per_iteration": 4.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104609, + "balance_loss_mlp": 1.08886182, + "diversity_loss_mlp": 0.0, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.08396151855964885, + "language_loss": 0.89357018, + "learning_rate": 0.0007685549146641262, + "loss": 0.90461624, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1767, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_mlp": 1.093521, + "diversity_loss_mlp": 0.0, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.10736891621188589, + "language_loss": 0.8816734, + "learning_rate": 0.0007682920727738579, + "loss": 0.89275646, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1768, + "time_per_iteration": 2.5119268894195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_mlp": 1.08738232, + "diversity_loss_mlp": 0.0, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.10494960168224592, + "language_loss": 0.85048056, + "learning_rate": 0.000768029126723369, + "loss": 0.86150718, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1769, + "time_per_iteration": 2.495424270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090257, + "balance_loss_mlp": 1.07520068, + "diversity_loss_mlp": 0.0, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08686425564719477, + "language_loss": 0.82128584, + "learning_rate": 0.0007677660766147447, + "loss": 0.83218843, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 1770, + "time_per_iteration": 2.532904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_mlp": 1.05578792, + "diversity_loss_mlp": 0.0, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.023964921008177247, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73537892, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 1771, + "time_per_iteration": 4.944117784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.1034112, + "diversity_loss_mlp": 0.0, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.10616133846526872, + "language_loss": 0.795196, + "learning_rate": 0.0007672396646316306, + "loss": 0.80637527, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1772, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134399, + "balance_loss_mlp": 1.11959314, + "diversity_loss_mlp": 0.0, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.07513330183645242, + "language_loss": 0.80376065, + "learning_rate": 0.000766976302961512, + "loss": 0.8151046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1773, + "time_per_iteration": 3.042421340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158934, + "balance_loss_mlp": 1.14410484, + "diversity_loss_mlp": 0.0, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.07872996810077096, + "language_loss": 0.81390858, + "learning_rate": 0.0007667128376420003, + "loss": 0.82549793, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1774, + "time_per_iteration": 2.536562442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208475, + "balance_loss_mlp": 1.19358635, + "diversity_loss_mlp": 0.0, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.08297883362487203, + "language_loss": 0.8462863, + "learning_rate": 0.0007664492687753817, + "loss": 0.85837102, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1775, + "time_per_iteration": 2.6977102756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198612, + "balance_loss_mlp": 1.18424678, + "diversity_loss_mlp": 0.0, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10155126624771216, + "language_loss": 0.81542516, + "learning_rate": 0.000766185596463983, + "loss": 0.82741123, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1776, + "time_per_iteration": 2.6038215160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196202, + "balance_loss_mlp": 1.18163514, + "diversity_loss_mlp": 0.0, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.0897891274607312, + "language_loss": 0.77011722, + "learning_rate": 0.0007659218208101706, + "loss": 0.78207922, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1777, + "time_per_iteration": 3.0933022499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173425, + "balance_loss_mlp": 1.15902483, + "diversity_loss_mlp": 0.0, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.08364054831663822, + "language_loss": 0.85122472, + "learning_rate": 0.0007656579419163515, + "loss": 0.86295897, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1778, + "time_per_iteration": 2.732297420501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146549, + "balance_loss_mlp": 1.13211274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.0722191895240348, + "language_loss": 0.77409559, + "learning_rate": 0.0007653939598849724, + "loss": 0.78556108, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1779, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_mlp": 1.02253902, + "diversity_loss_mlp": 0.0, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.029240552967656448, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83912855, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 1780, + "time_per_iteration": 4.9182775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10688317, + "diversity_loss_mlp": 0.0, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07624931845389674, + "language_loss": 0.80176342, + "learning_rate": 0.000764865686819522, + "loss": 0.81297386, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1781, + "time_per_iteration": 3.0602052211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111853, + "balance_loss_mlp": 1.097965, + "diversity_loss_mlp": 0.0, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.07936344533488468, + "language_loss": 0.85836053, + "learning_rate": 0.0007646013959905449, + "loss": 0.86947906, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1782, + "time_per_iteration": 2.5750925540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_mlp": 1.09528995, + "diversity_loss_mlp": 0.0, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.07233814650781724, + "language_loss": 0.81042612, + "learning_rate": 0.0007643370024341949, + "loss": 0.82151681, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1783, + "time_per_iteration": 3.0870087146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09431553, + "diversity_loss_mlp": 0.0, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.07806584209391611, + "language_loss": 0.83175099, + "learning_rate": 0.0007640725062531195, + "loss": 0.84283221, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1784, + "time_per_iteration": 2.5063886642456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_mlp": 1.08888865, + "diversity_loss_mlp": 0.0, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.5067557182324087, + "language_loss": 0.86699629, + "learning_rate": 0.0007638079075500047, + "loss": 0.87802398, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1785, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015111, + "balance_loss_mlp": 1.00562215, + "diversity_loss_mlp": 0.0, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.016449027395748255, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76195776, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 1786, + "time_per_iteration": 4.944318056106567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.13542247, + "diversity_loss_mlp": 0.0, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.07356798682381475, + "language_loss": 0.83088338, + "learning_rate": 0.0007632784029886026, + "loss": 0.84238386, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1787, + "time_per_iteration": 2.6217002868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204344, + "balance_loss_mlp": 1.1884768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.08799574205003287, + "language_loss": 0.85466659, + "learning_rate": 0.0007630134973358873, + "loss": 0.86671007, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1788, + "time_per_iteration": 2.9664394855499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_mlp": 1.2359066, + "diversity_loss_mlp": 0.0, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.1052875761358054, + "language_loss": 0.86575854, + "learning_rate": 0.0007627484895722763, + "loss": 0.87827688, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1789, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247407, + "balance_loss_mlp": 1.23117065, + "diversity_loss_mlp": 0.0, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.09611070791328494, + "language_loss": 0.80025196, + "learning_rate": 0.0007624833798006552, + "loss": 0.81272602, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.16235352, + "routerloss_mlp": 0.0, + "step": 1790, + "time_per_iteration": 3.046809196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238128, + "balance_loss_mlp": 1.22221315, + "diversity_loss_mlp": 0.0, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.07959093752215074, + "language_loss": 0.83783114, + "learning_rate": 0.0007622181681239483, + "loss": 0.8502124, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1791, + "time_per_iteration": 2.6601433753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_mlp": 1.22793913, + "diversity_loss_mlp": 0.0, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07919089267187412, + "language_loss": 0.84668601, + "learning_rate": 0.0007619528546451202, + "loss": 0.85912943, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 1792, + "time_per_iteration": 2.782947063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208587, + "balance_loss_mlp": 1.19314909, + "diversity_loss_mlp": 0.0, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.07332959959795217, + "language_loss": 0.83832949, + "learning_rate": 0.0007616874394671745, + "loss": 0.85041535, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1793, + "time_per_iteration": 3.3206703662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.169258, + "diversity_loss_mlp": 0.0, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.0713753042238581, + "language_loss": 0.85051751, + "learning_rate": 0.0007614219226931547, + "loss": 0.86236751, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1794, + "time_per_iteration": 2.7190396785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179587, + "balance_loss_mlp": 1.16401851, + "diversity_loss_mlp": 0.0, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.07163818055438703, + "language_loss": 0.8457973, + "learning_rate": 0.0007611563044261435, + "loss": 0.85759324, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 1795, + "time_per_iteration": 2.5077741146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150042, + "balance_loss_mlp": 1.13422251, + "diversity_loss_mlp": 0.0, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0670543853763616, + "language_loss": 0.86376798, + "learning_rate": 0.0007608905847692631, + "loss": 0.8752684, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1796, + "time_per_iteration": 2.4662768840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112741, + "balance_loss_mlp": 1.11171043, + "diversity_loss_mlp": 0.0, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07671810253227593, + "language_loss": 0.86553091, + "learning_rate": 0.0007606247638256749, + "loss": 0.87680501, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 1797, + "time_per_iteration": 2.8649494647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00624206, + "balance_loss_mlp": 1.05204535, + "diversity_loss_mlp": 0.16984753, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0016633519833830733, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.78794497, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01325956, + "step": 1798, + "time_per_iteration": 4.963132619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_mlp": 1.04498482, + "diversity_loss_mlp": 0.0, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.032920799461559694, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80382872, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 1799, + "time_per_iteration": 4.773633003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_mlp": 1.08345306, + "diversity_loss_mlp": 0.0, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.10233507255995049, + "language_loss": 0.85892332, + "learning_rate": 0.0007598266943068686, + "loss": 0.86992049, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 1800, + "time_per_iteration": 2.7380948066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092311, + "balance_loss_mlp": 1.0761466, + "diversity_loss_mlp": 0.0, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.08416075255699706, + "language_loss": 0.83903629, + "learning_rate": 0.0007595604692488507, + "loss": 0.84995937, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1801, + "time_per_iteration": 2.5558300018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099836, + "balance_loss_mlp": 1.08382583, + "diversity_loss_mlp": 0.0, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.0681721192963598, + "language_loss": 0.82674247, + "learning_rate": 0.0007592941434205215, + "loss": 0.83774084, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 1802, + "time_per_iteration": 2.8181002140045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017477, + "balance_loss_mlp": 1.00651026, + "diversity_loss_mlp": 0.0, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.018274165575771096, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588537, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 1803, + "time_per_iteration": 5.063629388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126513, + "balance_loss_mlp": 1.11121821, + "diversity_loss_mlp": 0.0, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07342722091818694, + "language_loss": 0.80217302, + "learning_rate": 0.0007587611898665566, + "loss": 0.81343818, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.15270996, + "routerloss_mlp": 0.0, + "step": 1804, + "time_per_iteration": 3.0994317531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113982, + "balance_loss_mlp": 1.12468028, + "diversity_loss_mlp": 0.0, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.05936466476556785, + "language_loss": 0.82130265, + "learning_rate": 0.0007584945623478315, + "loss": 0.83270085, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1805, + "time_per_iteration": 2.833981513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152624, + "balance_loss_mlp": 1.13780582, + "diversity_loss_mlp": 0.0, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.08744691316973383, + "language_loss": 0.80801159, + "learning_rate": 0.000758227834472617, + "loss": 0.81953788, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1806, + "time_per_iteration": 3.0535178184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166216, + "balance_loss_mlp": 1.15111172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07500761638021176, + "language_loss": 0.77729452, + "learning_rate": 0.0007579610063444664, + "loss": 0.7889567, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1807, + "time_per_iteration": 2.7615864276885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149194, + "balance_loss_mlp": 1.1339947, + "diversity_loss_mlp": 0.0, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.07406875426876382, + "language_loss": 0.87547183, + "learning_rate": 0.0007576940780669712, + "loss": 0.88696373, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1808, + "time_per_iteration": 3.264080762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143539, + "balance_loss_mlp": 1.12863731, + "diversity_loss_mlp": 0.0, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07928472428244501, + "language_loss": 0.84104979, + "learning_rate": 0.0007574270497437624, + "loss": 0.85248518, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1809, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128376, + "balance_loss_mlp": 1.11302221, + "diversity_loss_mlp": 0.0, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.07150597602774303, + "language_loss": 0.88426095, + "learning_rate": 0.000757159921478509, + "loss": 0.89554477, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 1810, + "time_per_iteration": 2.7891488075256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057512, + "balance_loss_mlp": 1.04754615, + "diversity_loss_mlp": 0.0, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.03228641235871289, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75508153, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 1811, + "time_per_iteration": 4.737962007522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103488, + "balance_loss_mlp": 1.08814573, + "diversity_loss_mlp": 0.0, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.07438083858778873, + "language_loss": 0.87798911, + "learning_rate": 0.0007566253655367423, + "loss": 0.88902402, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 1812, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.07600367, + "diversity_loss_mlp": 0.0, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.06854488097647142, + "language_loss": 0.8957805, + "learning_rate": 0.000756357938067762, + "loss": 0.90669596, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 1813, + "time_per_iteration": 2.7090489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.07826209, + "diversity_loss_mlp": 0.0, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.0690606019510397, + "language_loss": 0.8334865, + "learning_rate": 0.0007560904110718033, + "loss": 0.84443069, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1814, + "time_per_iteration": 3.2445590496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096093, + "balance_loss_mlp": 1.08003569, + "diversity_loss_mlp": 0.0, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06223934742271703, + "language_loss": 0.83650601, + "learning_rate": 0.0007558227846527297, + "loss": 0.84746695, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1815, + "time_per_iteration": 2.8504550457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_mlp": 1.08731842, + "diversity_loss_mlp": 0.0, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.07831164241761415, + "language_loss": 0.83117825, + "learning_rate": 0.0007555550589144429, + "loss": 0.84221166, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1816, + "time_per_iteration": 2.4655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_mlp": 1.09515882, + "diversity_loss_mlp": 0.0, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.08460625336983617, + "language_loss": 0.84522688, + "learning_rate": 0.000755287233960883, + "loss": 0.85633731, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1817, + "time_per_iteration": 2.602492094039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.07385683, + "diversity_loss_mlp": 0.0, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.07045705340523431, + "language_loss": 0.77682364, + "learning_rate": 0.0007550193098960292, + "loss": 0.78771949, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1818, + "time_per_iteration": 2.8674800395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00989642, + "balance_loss_mlp": 1.73270237, + "diversity_loss_mlp": 0.21087486, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.029406524514427698, + "language_loss": 0.86412024, + "learning_rate": 0.0007547512868238988, + "loss": 0.87401664, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01785346, + "step": 1819, + "time_per_iteration": 3.151559829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_mlp": 1.07453036, + "diversity_loss_mlp": 0.0, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.06124546921927801, + "language_loss": 0.83503008, + "learning_rate": 0.0007544831648485473, + "loss": 0.84593564, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1820, + "time_per_iteration": 2.6791367530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094234, + "balance_loss_mlp": 1.07806909, + "diversity_loss_mlp": 0.0, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.08232155140582742, + "language_loss": 0.81448233, + "learning_rate": 0.0007542149440740694, + "loss": 0.82542467, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1821, + "time_per_iteration": 2.665632724761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.07229352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.08177047744866778, + "language_loss": 0.85514361, + "learning_rate": 0.000753946624604597, + "loss": 0.8660273, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1822, + "time_per_iteration": 2.708221673965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085159, + "balance_loss_mlp": 1.06938744, + "diversity_loss_mlp": 0.0, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.07022994660183399, + "language_loss": 0.88119262, + "learning_rate": 0.0007536782065443015, + "loss": 0.89204431, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 1823, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109141, + "balance_loss_mlp": 1.0758059, + "diversity_loss_mlp": 0.0, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.09965750131036237, + "language_loss": 0.75038946, + "learning_rate": 0.0007534096899973919, + "loss": 0.7613036, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1824, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089888, + "balance_loss_mlp": 1.07460535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.0636070515998131, + "language_loss": 0.82941401, + "learning_rate": 0.0007531410750681154, + "loss": 0.84031284, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1825, + "time_per_iteration": 2.7595911026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_mlp": 1.08562207, + "diversity_loss_mlp": 0.0, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.09267960960885083, + "language_loss": 0.87015611, + "learning_rate": 0.0007528723618607575, + "loss": 0.88115728, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1826, + "time_per_iteration": 3.4216692447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090335, + "balance_loss_mlp": 1.07524323, + "diversity_loss_mlp": 0.0, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.07214965975453298, + "language_loss": 0.82582879, + "learning_rate": 0.0007526035504796422, + "loss": 0.83673215, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.15087891, + "routerloss_mlp": 0.0, + "step": 1827, + "time_per_iteration": 2.7822000980377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094099, + "balance_loss_mlp": 1.0794003, + "diversity_loss_mlp": 0.0, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.07057247929289283, + "language_loss": 0.86824054, + "learning_rate": 0.0007523346410291312, + "loss": 0.8791815, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1828, + "time_per_iteration": 2.7560181617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098232, + "balance_loss_mlp": 1.08291376, + "diversity_loss_mlp": 0.0, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.0630617970486185, + "language_loss": 0.85159689, + "learning_rate": 0.0007520656336136245, + "loss": 0.86257917, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1829, + "time_per_iteration": 2.9432313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098974, + "balance_loss_mlp": 1.08431172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06541232162591855, + "language_loss": 0.88230217, + "learning_rate": 0.0007517965283375599, + "loss": 0.89329195, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1830, + "time_per_iteration": 2.8773486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_mlp": 1.08363926, + "diversity_loss_mlp": 0.0, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.06973135687475002, + "language_loss": 0.89511967, + "learning_rate": 0.0007515273253054132, + "loss": 0.90610522, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1831, + "time_per_iteration": 2.662757396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097986, + "balance_loss_mlp": 1.08288169, + "diversity_loss_mlp": 0.0, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.07142201858296882, + "language_loss": 0.82785273, + "learning_rate": 0.0007512580246216988, + "loss": 0.83883256, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1832, + "time_per_iteration": 2.730994939804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.08164394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.07119734441282773, + "language_loss": 0.84715027, + "learning_rate": 0.000750988626390968, + "loss": 0.85811406, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1833, + "time_per_iteration": 2.604182004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.07508624, + "diversity_loss_mlp": 0.0, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.07060575001723658, + "language_loss": 0.85089648, + "learning_rate": 0.0007507191307178108, + "loss": 0.86179501, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1834, + "time_per_iteration": 2.7584774494171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_mlp": 1.06808281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.09392412586459238, + "language_loss": 0.75105453, + "learning_rate": 0.0007504495377068543, + "loss": 0.76188982, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1835, + "time_per_iteration": 2.731039524078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087025, + "balance_loss_mlp": 1.07230306, + "diversity_loss_mlp": 0.0, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09299008065025831, + "language_loss": 0.81784093, + "learning_rate": 0.0007501798474627642, + "loss": 0.82871115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1836, + "time_per_iteration": 2.9180665016174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092183, + "balance_loss_mlp": 1.07738876, + "diversity_loss_mlp": 0.0, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.06800399913452355, + "language_loss": 0.8354817, + "learning_rate": 0.0007499100600902433, + "loss": 0.84640354, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1837, + "time_per_iteration": 2.981478452682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097597, + "balance_loss_mlp": 1.08236217, + "diversity_loss_mlp": 0.0, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.07178124654929893, + "language_loss": 0.83625698, + "learning_rate": 0.0007496401756940324, + "loss": 0.84723294, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1838, + "time_per_iteration": 2.7256877422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107267, + "balance_loss_mlp": 1.09267545, + "diversity_loss_mlp": 0.0, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.08438072522416575, + "language_loss": 0.81940264, + "learning_rate": 0.0007493701943789098, + "loss": 0.83047533, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1839, + "time_per_iteration": 2.805553674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_mlp": 1.10266685, + "diversity_loss_mlp": 0.0, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07000666511795951, + "language_loss": 0.82830888, + "learning_rate": 0.000749100116249692, + "loss": 0.83948612, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1840, + "time_per_iteration": 2.608135223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00954188, + "balance_loss_mlp": 1.66862321, + "diversity_loss_mlp": 0.20571998, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.03743173710930313, + "language_loss": 0.86076337, + "learning_rate": 0.0007488299414112321, + "loss": 0.87030524, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01701665, + "step": 1841, + "time_per_iteration": 2.6307811737060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_mlp": 1.10974133, + "diversity_loss_mlp": 0.0, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.06710116446149988, + "language_loss": 0.77204335, + "learning_rate": 0.0007485596699684215, + "loss": 0.78328466, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1842, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132611, + "balance_loss_mlp": 1.11780548, + "diversity_loss_mlp": 0.0, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.07987851383877129, + "language_loss": 0.85353696, + "learning_rate": 0.000748289302026189, + "loss": 0.86486304, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1843, + "time_per_iteration": 2.8449106216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_mlp": 1.11339569, + "diversity_loss_mlp": 0.0, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.06918658934745357, + "language_loss": 0.85752398, + "learning_rate": 0.0007480188376895004, + "loss": 0.86880362, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1844, + "time_per_iteration": 3.0339298248291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160602, + "balance_loss_mlp": 1.15135121, + "diversity_loss_mlp": 0.0, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.06421168097867443, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74971944, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 1845, + "time_per_iteration": 4.932978391647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.10506296, + "diversity_loss_mlp": 0.0, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08194467088107492, + "language_loss": 0.78768218, + "learning_rate": 0.0007474776202528074, + "loss": 0.79887938, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1846, + "time_per_iteration": 2.9188990592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111713, + "balance_loss_mlp": 1.1021452, + "diversity_loss_mlp": 0.0, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08015412782248336, + "language_loss": 0.80999184, + "learning_rate": 0.000747206867362922, + "loss": 0.82116312, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1847, + "time_per_iteration": 3.0966272354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_mlp": 1.085235, + "diversity_loss_mlp": 0.0, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.09857033029565816, + "language_loss": 0.836568, + "learning_rate": 0.0007469360184988194, + "loss": 0.84756613, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1848, + "time_per_iteration": 2.9021246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_mlp": 1.08986914, + "diversity_loss_mlp": 0.0, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08185517170087683, + "language_loss": 0.86821651, + "learning_rate": 0.0007466650737656518, + "loss": 0.8792634, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1849, + "time_per_iteration": 2.615549325942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_mlp": 1.0876888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06916390030254578, + "language_loss": 0.89687926, + "learning_rate": 0.0007463940332686098, + "loss": 0.9078998, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1850, + "time_per_iteration": 2.497159242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931214, + "balance_loss_mlp": 1.62144685, + "diversity_loss_mlp": 0.20650919, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.030410176313075864, + "language_loss": 0.84120536, + "learning_rate": 0.0007461228971129205, + "loss": 0.85051751, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01723633, + "step": 1851, + "time_per_iteration": 2.959170341491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931448, + "balance_loss_mlp": 1.62270963, + "diversity_loss_mlp": 0.20620242, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.03221270440610224, + "language_loss": 0.85523784, + "learning_rate": 0.0007458516654038483, + "loss": 0.86455238, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01699215, + "step": 1852, + "time_per_iteration": 2.6886868476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.13526964, + "diversity_loss_mlp": 0.0, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06572834298852859, + "language_loss": 0.86835778, + "learning_rate": 0.0007455803382466946, + "loss": 0.8798511, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1853, + "time_per_iteration": 2.8323659896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151398, + "balance_loss_mlp": 1.13686657, + "diversity_loss_mlp": 0.0, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.06349489422764842, + "language_loss": 0.86956179, + "learning_rate": 0.0007453089157467979, + "loss": 0.88107574, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1854, + "time_per_iteration": 2.817117929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.13687038, + "diversity_loss_mlp": 0.0, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06687597930641362, + "language_loss": 0.8221277, + "learning_rate": 0.0007450373980095341, + "loss": 0.83364242, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1855, + "time_per_iteration": 3.0857772827148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_mlp": 1.13494754, + "diversity_loss_mlp": 0.0, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.0656889709190827, + "language_loss": 0.86804116, + "learning_rate": 0.0007447657851403155, + "loss": 0.87952584, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1856, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144273, + "balance_loss_mlp": 1.1303966, + "diversity_loss_mlp": 0.0, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.08894932465162153, + "language_loss": 0.78988904, + "learning_rate": 0.0007444940772445915, + "loss": 0.80133176, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1857, + "time_per_iteration": 2.752232551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122576, + "balance_loss_mlp": 1.10860419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06705763345081875, + "language_loss": 0.80129987, + "learning_rate": 0.0007442222744278484, + "loss": 0.81252563, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1858, + "time_per_iteration": 2.638322591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_mlp": 1.09717393, + "diversity_loss_mlp": 0.0, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.05935371072747042, + "language_loss": 0.8399322, + "learning_rate": 0.0007439503767956099, + "loss": 0.85104102, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.137146, + "routerloss_mlp": 0.0, + "step": 1859, + "time_per_iteration": 2.699204921722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124434, + "balance_loss_mlp": 1.11480188, + "diversity_loss_mlp": 0.0, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.03541879327423246, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80796039, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 1860, + "time_per_iteration": 4.89499831199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089258, + "balance_loss_mlp": 1.07479787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.06413043417122823, + "language_loss": 0.86215138, + "learning_rate": 0.000743406297506922, + "loss": 0.87304389, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1861, + "time_per_iteration": 2.7184388637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919817, + "balance_loss_mlp": 1.60078692, + "diversity_loss_mlp": 0.20507258, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.028510278569739433, + "language_loss": 0.84439111, + "learning_rate": 0.0007431341160617031, + "loss": 0.8535893, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01688758, + "step": 1862, + "time_per_iteration": 2.8915610313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_mlp": 1.06988358, + "diversity_loss_mlp": 0.0, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06954606141633879, + "language_loss": 0.88100171, + "learning_rate": 0.0007428618402234491, + "loss": 0.8918457, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1863, + "time_per_iteration": 2.6724555492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087919, + "balance_loss_mlp": 1.0733279, + "diversity_loss_mlp": 0.0, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.07542508091229044, + "language_loss": 0.80288851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81376767, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1864, + "time_per_iteration": 2.724853038787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.06996608, + "diversity_loss_mlp": 0.0, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07695346444963648, + "language_loss": 0.7981261, + "learning_rate": 0.0007423170057906996, + "loss": 0.80896473, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1865, + "time_per_iteration": 3.9006779193878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108627, + "balance_loss_mlp": 1.0722512, + "diversity_loss_mlp": 0.0, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.07814080760266444, + "language_loss": 0.86228722, + "learning_rate": 0.0007420444474077275, + "loss": 0.87314993, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1866, + "time_per_iteration": 2.546194076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095193, + "balance_loss_mlp": 1.0812335, + "diversity_loss_mlp": 0.0, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.0773553058948038, + "language_loss": 0.8949936, + "learning_rate": 0.0007417717950547671, + "loss": 0.90594554, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1867, + "time_per_iteration": 2.5670700073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_mlp": 1.04262233, + "diversity_loss_mlp": 0.0, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.023944930622272237, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.770491, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 1868, + "time_per_iteration": 4.900780200958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_mlp": 1.087533, + "diversity_loss_mlp": 0.0, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.06547244306940128, + "language_loss": 0.84938717, + "learning_rate": 0.0007412262088623299, + "loss": 0.86040014, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1869, + "time_per_iteration": 2.7674195766448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092029, + "balance_loss_mlp": 1.60128522, + "diversity_loss_mlp": 0.20662443, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.03542659619783611, + "language_loss": 0.79155517, + "learning_rate": 0.0007409532752346684, + "loss": 0.80075806, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633519, + "step": 1870, + "time_per_iteration": 2.7116785049438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111485, + "balance_loss_mlp": 1.101367, + "diversity_loss_mlp": 0.0, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.061502004439029076, + "language_loss": 0.8836326, + "learning_rate": 0.0007406802480606491, + "loss": 0.89478111, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1871, + "time_per_iteration": 2.642608165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105605, + "balance_loss_mlp": 1.0916698, + "diversity_loss_mlp": 0.0, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.06939665757215846, + "language_loss": 0.90353388, + "learning_rate": 0.0007404071274462707, + "loss": 0.91458994, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.1394043, + "routerloss_mlp": 0.0, + "step": 1872, + "time_per_iteration": 2.5600955486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09967744, + "diversity_loss_mlp": 0.0, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.07241097832053987, + "language_loss": 0.83719409, + "learning_rate": 0.0007401339134975682, + "loss": 0.84832925, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1873, + "time_per_iteration": 2.6775293350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111162, + "balance_loss_mlp": 1.09724998, + "diversity_loss_mlp": 0.0, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.07980684605652169, + "language_loss": 0.84604299, + "learning_rate": 0.0007398606063206122, + "loss": 0.85715467, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1874, + "time_per_iteration": 2.6092889308929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109131, + "balance_loss_mlp": 1.09546924, + "diversity_loss_mlp": 0.0, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09304103013369584, + "language_loss": 0.78818524, + "learning_rate": 0.0007395872060215101, + "loss": 0.79927647, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1875, + "time_per_iteration": 2.5999374389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124779, + "balance_loss_mlp": 1.11121297, + "diversity_loss_mlp": 0.0, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08049441369365674, + "language_loss": 0.8851527, + "learning_rate": 0.0007393137127064056, + "loss": 0.89640045, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1876, + "time_per_iteration": 2.635896682739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_mlp": 1.11380959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.06613177233605298, + "language_loss": 0.84377646, + "learning_rate": 0.0007390401264814779, + "loss": 0.8550508, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1877, + "time_per_iteration": 2.597508192062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151319, + "balance_loss_mlp": 1.1378243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.09083655630754779, + "language_loss": 0.84454513, + "learning_rate": 0.0007387664474529427, + "loss": 0.8560583, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1878, + "time_per_iteration": 2.6493661403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143725, + "balance_loss_mlp": 1.1302073, + "diversity_loss_mlp": 0.0, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.0643860955644754, + "language_loss": 0.91379291, + "learning_rate": 0.0007384926757270518, + "loss": 0.92523015, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1879, + "time_per_iteration": 2.62565016746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152012, + "balance_loss_mlp": 1.13819528, + "diversity_loss_mlp": 0.0, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07609143241795291, + "language_loss": 0.80057949, + "learning_rate": 0.0007382188114100924, + "loss": 0.81209958, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1880, + "time_per_iteration": 2.974212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.14148784, + "diversity_loss_mlp": 0.0, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.0632350243804942, + "language_loss": 0.8182314, + "learning_rate": 0.0007379448546083884, + "loss": 0.82978803, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1881, + "time_per_iteration": 2.894099712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154364, + "balance_loss_mlp": 1.14052355, + "diversity_loss_mlp": 0.0, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06232367753538678, + "language_loss": 0.8822301, + "learning_rate": 0.0007376708054282992, + "loss": 0.89377379, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1882, + "time_per_iteration": 2.9576163291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162916, + "balance_loss_mlp": 1.14919519, + "diversity_loss_mlp": 0.0, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06608098206448941, + "language_loss": 0.83563071, + "learning_rate": 0.0007373966639762201, + "loss": 0.84725988, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1883, + "time_per_iteration": 2.6004068851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158732, + "balance_loss_mlp": 1.14478457, + "diversity_loss_mlp": 0.0, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.07441448138889938, + "language_loss": 0.88544619, + "learning_rate": 0.0007371224303585822, + "loss": 0.89703357, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1884, + "time_per_iteration": 2.5741078853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109461, + "balance_loss_mlp": 1.09897089, + "diversity_loss_mlp": 0.0, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.03545085729862102, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81466532, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 1885, + "time_per_iteration": 4.706872224807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.13442218, + "diversity_loss_mlp": 0.0, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.0691831634947964, + "language_loss": 0.8278423, + "learning_rate": 0.0007365736870525335, + "loss": 0.83932269, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1886, + "time_per_iteration": 2.8480284214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.12236464, + "diversity_loss_mlp": 0.0, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.0786816251155578, + "language_loss": 0.82659888, + "learning_rate": 0.000736299177577164, + "loss": 0.83795714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1887, + "time_per_iteration": 2.601449966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127197, + "balance_loss_mlp": 1.11358309, + "diversity_loss_mlp": 0.0, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0767010159800114, + "language_loss": 0.8381778, + "learning_rate": 0.0007360245763623174, + "loss": 0.84944975, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1888, + "time_per_iteration": 2.6951138973236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_mlp": 1.09350717, + "diversity_loss_mlp": 0.0, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06311908909694558, + "language_loss": 0.89886129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90992391, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1889, + "time_per_iteration": 2.8509137630462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094399, + "balance_loss_mlp": 1.08141732, + "diversity_loss_mlp": 0.0, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.06820711534899371, + "language_loss": 0.86674547, + "learning_rate": 0.0007354750991406684, + "loss": 0.87768942, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1890, + "time_per_iteration": 2.7162795066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_mlp": 1.07673419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07876014589837055, + "language_loss": 0.80930853, + "learning_rate": 0.0007352002233471919, + "loss": 0.82020569, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1891, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091374, + "balance_loss_mlp": 1.07835662, + "diversity_loss_mlp": 0.0, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.08103720744805817, + "language_loss": 0.79372823, + "learning_rate": 0.0007349252562408906, + "loss": 0.80464196, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1892, + "time_per_iteration": 2.6752734184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097706, + "balance_loss_mlp": 1.08496833, + "diversity_loss_mlp": 0.0, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.07356128462514616, + "language_loss": 0.81490725, + "learning_rate": 0.0007346501979285158, + "loss": 0.82588428, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1893, + "time_per_iteration": 2.8990893363952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_mlp": 1.03214884, + "diversity_loss_mlp": 0.0, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.022756463517582398, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81579787, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 1894, + "time_per_iteration": 4.8097145557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098768, + "balance_loss_mlp": 1.0857501, + "diversity_loss_mlp": 0.0, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06969655176236832, + "language_loss": 0.85880721, + "learning_rate": 0.0007340998081127308, + "loss": 0.86979485, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1895, + "time_per_iteration": 2.757380485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087705, + "balance_loss_mlp": 1.074646, + "diversity_loss_mlp": 0.0, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06910669114263218, + "language_loss": 0.91127002, + "learning_rate": 0.0007338244768230007, + "loss": 0.92214715, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1896, + "time_per_iteration": 2.7967634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098629, + "balance_loss_mlp": 1.08584976, + "diversity_loss_mlp": 0.0, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.05804787602656793, + "language_loss": 0.88684666, + "learning_rate": 0.0007335490547545578, + "loss": 0.89783299, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1897, + "time_per_iteration": 3.086498260498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_mlp": 1.08286643, + "diversity_loss_mlp": 0.0, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06953546528053214, + "language_loss": 0.82679451, + "learning_rate": 0.0007332735420143308, + "loss": 0.83774823, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1898, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097867, + "balance_loss_mlp": 1.08476591, + "diversity_loss_mlp": 0.0, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.07600656362423025, + "language_loss": 0.86647844, + "learning_rate": 0.0007329979387092826, + "loss": 0.87745708, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1899, + "time_per_iteration": 2.5437934398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_mlp": 1.08821869, + "diversity_loss_mlp": 0.0, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.05952938167480439, + "language_loss": 0.83796108, + "learning_rate": 0.0007327222449464124, + "loss": 0.8489722, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1900, + "time_per_iteration": 3.2824244499206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_mlp": 1.09499097, + "diversity_loss_mlp": 0.0, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07745224305421915, + "language_loss": 0.88634431, + "learning_rate": 0.0007324464608327538, + "loss": 0.89742231, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.12823486, + "routerloss_mlp": 0.0, + "step": 1901, + "time_per_iteration": 2.6411991119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_mlp": 1.08995461, + "diversity_loss_mlp": 0.0, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.08223816362142805, + "language_loss": 0.88474846, + "learning_rate": 0.0007321705864753758, + "loss": 0.89577842, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.1305542, + "routerloss_mlp": 0.0, + "step": 1902, + "time_per_iteration": 2.682002544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931657, + "balance_loss_mlp": 1.62497878, + "diversity_loss_mlp": 0.20707282, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.026825446902959647, + "language_loss": 0.84137708, + "learning_rate": 0.0007318946219813823, + "loss": 0.85069364, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563089, + "step": 1903, + "time_per_iteration": 3.0061404705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108165, + "balance_loss_mlp": 1.09403849, + "diversity_loss_mlp": 0.0, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.07526416733947026, + "language_loss": 0.89736164, + "learning_rate": 0.000731618567457912, + "loss": 0.90844321, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.14105225, + "routerloss_mlp": 0.0, + "step": 1904, + "time_per_iteration": 2.6523027420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099252, + "balance_loss_mlp": 1.08536446, + "diversity_loss_mlp": 0.0, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07605082206895837, + "language_loss": 0.87058568, + "learning_rate": 0.000731342423012139, + "loss": 0.88157821, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1905, + "time_per_iteration": 3.0595312118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_mlp": 1.08213234, + "diversity_loss_mlp": 0.0, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07718853495225737, + "language_loss": 0.82559443, + "learning_rate": 0.0007310661887512722, + "loss": 0.83655763, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1906, + "time_per_iteration": 3.056859016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090478, + "balance_loss_mlp": 1.07672131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.07458396044121823, + "language_loss": 0.8194133, + "learning_rate": 0.0007307898647825549, + "loss": 0.83031803, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1907, + "time_per_iteration": 2.670468807220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_mlp": 1.07666349, + "diversity_loss_mlp": 0.0, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09231339543244264, + "language_loss": 0.89368939, + "learning_rate": 0.0007305134512132659, + "loss": 0.90459347, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.13751221, + "routerloss_mlp": 0.0, + "step": 1908, + "time_per_iteration": 2.6561663150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091843, + "balance_loss_mlp": 1.07826495, + "diversity_loss_mlp": 0.0, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.08913139219920335, + "language_loss": 0.83308864, + "learning_rate": 0.0007302369481507183, + "loss": 0.84400707, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1909, + "time_per_iteration": 2.5485799312591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017138, + "balance_loss_mlp": 1.00979447, + "diversity_loss_mlp": 0.0, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.013277678950868657, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.80978894, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1910, + "time_per_iteration": 4.848855257034302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111697, + "balance_loss_mlp": 1.09842944, + "diversity_loss_mlp": 0.0, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.058739485749840115, + "language_loss": 0.85315347, + "learning_rate": 0.000729683673975274, + "loss": 0.86427045, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.13287354, + "routerloss_mlp": 0.0, + "step": 1911, + "time_per_iteration": 2.690218210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114863, + "balance_loss_mlp": 1.10165429, + "diversity_loss_mlp": 0.0, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05585809452393386, + "language_loss": 0.8291769, + "learning_rate": 0.0007294069030771774, + "loss": 0.84032547, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1912, + "time_per_iteration": 3.678927183151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125561, + "balance_loss_mlp": 1.1124301, + "diversity_loss_mlp": 0.0, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.06389765233013874, + "language_loss": 0.90667701, + "learning_rate": 0.0007291300431154224, + "loss": 0.91793263, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1913, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.03611672, + "diversity_loss_mlp": 0.0, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.02051984405011318, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7143358, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1914, + "time_per_iteration": 4.973980903625488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137116, + "balance_loss_mlp": 1.12441444, + "diversity_loss_mlp": 0.0, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.0814243559806059, + "language_loss": 0.7981922, + "learning_rate": 0.0007285760564309179, + "loss": 0.8095634, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.1270752, + "routerloss_mlp": 0.0, + "step": 1915, + "time_per_iteration": 3.091447353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127931, + "balance_loss_mlp": 1.11485386, + "diversity_loss_mlp": 0.0, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.09574055809111115, + "language_loss": 0.84848046, + "learning_rate": 0.0007282989299232448, + "loss": 0.85975981, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.13092041, + "routerloss_mlp": 0.0, + "step": 1916, + "time_per_iteration": 3.074547052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113017, + "balance_loss_mlp": 1.09977341, + "diversity_loss_mlp": 0.0, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.08763204320127825, + "language_loss": 0.83209801, + "learning_rate": 0.0007280217147820668, + "loss": 0.84322822, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1917, + "time_per_iteration": 2.6260228157043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092055, + "balance_loss_mlp": 1.07888198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06316346716689762, + "language_loss": 0.79465461, + "learning_rate": 0.0007277444111150079, + "loss": 0.80557513, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.13189697, + "routerloss_mlp": 0.0, + "step": 1918, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_mlp": 1.07465601, + "diversity_loss_mlp": 0.0, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.09595367080188737, + "language_loss": 0.84512901, + "learning_rate": 0.0007274670190297272, + "loss": 0.85601443, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1919, + "time_per_iteration": 2.590839147567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_mlp": 1.07205224, + "diversity_loss_mlp": 0.0, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.07431087712553297, + "language_loss": 0.82079387, + "learning_rate": 0.0007271895386339179, + "loss": 0.83165228, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1920, + "time_per_iteration": 2.7924282550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094639, + "balance_loss_mlp": 1.08048892, + "diversity_loss_mlp": 0.0, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.07797312778631413, + "language_loss": 0.83431751, + "learning_rate": 0.0007269119700353073, + "loss": 0.84526384, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1921, + "time_per_iteration": 2.7155139446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_mlp": 1.0987196, + "diversity_loss_mlp": 0.0, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.07250682713227712, + "language_loss": 0.84994757, + "learning_rate": 0.0007266343133416571, + "loss": 0.86107111, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1922, + "time_per_iteration": 2.7394983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.06564641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.035523530201468645, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78190196, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.07617188, + "routerloss_mlp": 0.0, + "step": 1923, + "time_per_iteration": 4.877161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10153794, + "diversity_loss_mlp": 0.0, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.0789330271899564, + "language_loss": 0.84356588, + "learning_rate": 0.0007260787361004556, + "loss": 0.85471952, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1924, + "time_per_iteration": 2.608745813369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_mlp": 1.02985299, + "diversity_loss_mlp": 0.0, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.021371165562314075, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74798417, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 1925, + "time_per_iteration": 4.906585931777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114233, + "balance_loss_mlp": 1.10069048, + "diversity_loss_mlp": 0.0, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.12026638393290963, + "language_loss": 0.87422252, + "learning_rate": 0.0007255228077730903, + "loss": 0.88536477, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1926, + "time_per_iteration": 2.6886680126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_mlp": 1.11107421, + "diversity_loss_mlp": 0.0, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06719853297068734, + "language_loss": 0.81722987, + "learning_rate": 0.0007252447122218632, + "loss": 0.82846814, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1927, + "time_per_iteration": 3.1511058807373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_mlp": 1.11258984, + "diversity_loss_mlp": 0.0, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.08764579691953547, + "language_loss": 0.87849444, + "learning_rate": 0.0007249665292228834, + "loss": 0.88974959, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1928, + "time_per_iteration": 2.565991163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120289, + "balance_loss_mlp": 1.1073308, + "diversity_loss_mlp": 0.0, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.0633685198143462, + "language_loss": 0.83318496, + "learning_rate": 0.000724688258884151, + "loss": 0.84438789, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1929, + "time_per_iteration": 2.531827926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.10286927, + "diversity_loss_mlp": 0.0, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.05744658583323744, + "language_loss": 0.86564112, + "learning_rate": 0.0007244099013137002, + "loss": 0.8767941, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1930, + "time_per_iteration": 3.1130166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.10404849, + "diversity_loss_mlp": 0.0, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.06880018611034966, + "language_loss": 0.88695574, + "learning_rate": 0.0007241314566195993, + "loss": 0.89812243, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.1262207, + "routerloss_mlp": 0.0, + "step": 1931, + "time_per_iteration": 3.374743700027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110097, + "balance_loss_mlp": 1.08821416, + "diversity_loss_mlp": 0.0, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.06303779661636588, + "language_loss": 0.85510373, + "learning_rate": 0.0007238529249099496, + "loss": 0.86611342, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1932, + "time_per_iteration": 2.6654059886932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_mlp": 1.0911988, + "diversity_loss_mlp": 0.0, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03412398452916775, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78954613, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 1933, + "time_per_iteration": 4.851354598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091175, + "balance_loss_mlp": 1.07859278, + "diversity_loss_mlp": 0.0, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.08014253307267598, + "language_loss": 0.80636895, + "learning_rate": 0.000723295600876581, + "loss": 0.81728071, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 1934, + "time_per_iteration": 3.0025534629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097044, + "balance_loss_mlp": 1.08416963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.08698689907724866, + "language_loss": 0.88006312, + "learning_rate": 0.0007230168087692344, + "loss": 0.89103359, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.12872314, + "routerloss_mlp": 0.0, + "step": 1935, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095813, + "balance_loss_mlp": 1.0830214, + "diversity_loss_mlp": 0.0, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07031074193849007, + "language_loss": 0.82382512, + "learning_rate": 0.0007227379300790839, + "loss": 0.8347832, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1936, + "time_per_iteration": 3.0040676593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092601, + "balance_loss_mlp": 1.07969058, + "diversity_loss_mlp": 0.0, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.07132774808829288, + "language_loss": 0.85478282, + "learning_rate": 0.0007224589649143997, + "loss": 0.86570889, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 1937, + "time_per_iteration": 2.584545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089825, + "balance_loss_mlp": 1.07662272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.0711139803163438, + "language_loss": 0.8120302, + "learning_rate": 0.0007221799133834861, + "loss": 0.82292843, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.13214111, + "routerloss_mlp": 0.0, + "step": 1938, + "time_per_iteration": 2.6393649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109955, + "balance_loss_mlp": 1.08649623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20460237815205612, + "language_loss": 0.81793052, + "learning_rate": 0.00072190077559468, + "loss": 0.82892597, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1939, + "time_per_iteration": 2.5494682788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127139, + "balance_loss_mlp": 1.1140976, + "diversity_loss_mlp": 0.0, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.05817015695703163, + "language_loss": 0.89248812, + "learning_rate": 0.0007216215516563527, + "loss": 0.90375948, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.13049316, + "routerloss_mlp": 0.0, + "step": 1940, + "time_per_iteration": 2.6755452156066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129035, + "balance_loss_mlp": 1.1159811, + "diversity_loss_mlp": 0.0, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.07778932214282369, + "language_loss": 0.83852386, + "learning_rate": 0.0007213422416769083, + "loss": 0.84981418, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1941, + "time_per_iteration": 2.6008002758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_mlp": 1.12319708, + "diversity_loss_mlp": 0.0, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.06345716224902766, + "language_loss": 0.7501297, + "learning_rate": 0.0007210628457647849, + "loss": 0.76148963, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1942, + "time_per_iteration": 2.5911362171173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_mlp": 1.12763917, + "diversity_loss_mlp": 0.0, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.06753886702103719, + "language_loss": 0.78585184, + "learning_rate": 0.000720783364028453, + "loss": 0.7972604, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.13238525, + "routerloss_mlp": 0.0, + "step": 1943, + "time_per_iteration": 2.7490458488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149977, + "balance_loss_mlp": 1.13685822, + "diversity_loss_mlp": 0.0, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.0650742437261564, + "language_loss": 0.87667847, + "learning_rate": 0.0007205037965764177, + "loss": 0.88817823, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1944, + "time_per_iteration": 2.5870554447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_mlp": 1.12192512, + "diversity_loss_mlp": 0.0, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07468357539719116, + "language_loss": 0.85650361, + "learning_rate": 0.0007202241435172161, + "loss": 0.86785173, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1945, + "time_per_iteration": 2.7550253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131811, + "balance_loss_mlp": 1.11901414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07270487210957549, + "language_loss": 0.87884831, + "learning_rate": 0.0007199444049594198, + "loss": 0.8901664, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1946, + "time_per_iteration": 2.9499337673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111095, + "balance_loss_mlp": 1.09783912, + "diversity_loss_mlp": 0.0, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.07247382516020226, + "language_loss": 0.83384776, + "learning_rate": 0.0007196645810116322, + "loss": 0.84495866, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1947, + "time_per_iteration": 2.70394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113218, + "balance_loss_mlp": 1.1003499, + "diversity_loss_mlp": 0.0, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07522309633784076, + "language_loss": 0.84431696, + "learning_rate": 0.0007193846717824912, + "loss": 0.8554492, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1948, + "time_per_iteration": 2.923752546310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116002, + "balance_loss_mlp": 1.10312748, + "diversity_loss_mlp": 0.0, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.06883561802065806, + "language_loss": 0.88268626, + "learning_rate": 0.0007191046773806669, + "loss": 0.89384627, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1949, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108921, + "balance_loss_mlp": 1.09593272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07969110082801287, + "language_loss": 0.83211446, + "learning_rate": 0.0007188245979148631, + "loss": 0.84320366, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1950, + "time_per_iteration": 3.193124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_mlp": 1.09892154, + "diversity_loss_mlp": 0.0, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.07005872092850987, + "language_loss": 0.87434363, + "learning_rate": 0.0007185444334938157, + "loss": 0.88546085, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1951, + "time_per_iteration": 2.669201135635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_mlp": 1.0892663, + "diversity_loss_mlp": 0.0, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.08195801919923047, + "language_loss": 0.85047525, + "learning_rate": 0.0007182641842262947, + "loss": 0.86149311, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 1952, + "time_per_iteration": 2.602139472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092197, + "balance_loss_mlp": 1.07936394, + "diversity_loss_mlp": 0.0, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.07349771430020792, + "language_loss": 0.77754879, + "learning_rate": 0.0007179838502211022, + "loss": 0.78847075, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.128479, + "routerloss_mlp": 0.0, + "step": 1953, + "time_per_iteration": 2.85720157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_mlp": 1.08148086, + "diversity_loss_mlp": 0.0, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0681681729591206, + "language_loss": 0.86330736, + "learning_rate": 0.0007177034315870738, + "loss": 0.87425238, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1954, + "time_per_iteration": 2.958862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_mlp": 1.08803654, + "diversity_loss_mlp": 0.0, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06642365438263753, + "language_loss": 0.90809441, + "learning_rate": 0.0007174229284330773, + "loss": 0.91910505, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1955, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_mlp": 1.07642531, + "diversity_loss_mlp": 0.0, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.07788827503332588, + "language_loss": 0.86705017, + "learning_rate": 0.0007171423408680141, + "loss": 0.87794375, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1956, + "time_per_iteration": 2.8101606369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00950311, + "balance_loss_mlp": 1.6602329, + "diversity_loss_mlp": 0.20739825, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.03218717292019043, + "language_loss": 0.89567441, + "learning_rate": 0.0007168616690008176, + "loss": 0.90517747, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01649548, + "step": 1957, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081569, + "balance_loss_mlp": 1.06840825, + "diversity_loss_mlp": 0.0, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.07242251254882147, + "language_loss": 0.85681045, + "learning_rate": 0.0007165809129404545, + "loss": 0.86762613, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1958, + "time_per_iteration": 2.8396048545837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_mlp": 1.07657433, + "diversity_loss_mlp": 0.0, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08227545286248691, + "language_loss": 0.86212921, + "learning_rate": 0.0007163000727959239, + "loss": 0.87303019, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1959, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087148, + "balance_loss_mlp": 1.07989979, + "diversity_loss_mlp": 0.0, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.05215322395932221, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79046214, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 1960, + "time_per_iteration": 4.869986057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095327, + "balance_loss_mlp": 1.08232689, + "diversity_loss_mlp": 0.0, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.08048811275026858, + "language_loss": 0.84568793, + "learning_rate": 0.00071573814069052, + "loss": 0.85664117, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.13018799, + "routerloss_mlp": 0.0, + "step": 1961, + "time_per_iteration": 2.9122819900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109901, + "balance_loss_mlp": 1.08614171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.06061063893945359, + "language_loss": 0.88073885, + "learning_rate": 0.0007154570489478081, + "loss": 0.89172894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1962, + "time_per_iteration": 3.1824018955230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111399, + "balance_loss_mlp": 1.10154414, + "diversity_loss_mlp": 0.0, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.06274200702745775, + "language_loss": 0.86391222, + "learning_rate": 0.0007151758735572514, + "loss": 0.87505209, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 1963, + "time_per_iteration": 2.997624158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_mlp": 1.09836888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.07983075782925624, + "language_loss": 0.80894458, + "learning_rate": 0.0007148946146280119, + "loss": 0.82005548, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 1964, + "time_per_iteration": 2.836583137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00620122, + "balance_loss_mlp": 1.05382681, + "diversity_loss_mlp": 0.16216688, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.0017779517528101797, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.72812271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01212509, + "step": 1965, + "time_per_iteration": 4.906678915023804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_mlp": 1.02436352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.025755206304302582, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.7637251, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 1966, + "time_per_iteration": 4.93319296836853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127949, + "balance_loss_mlp": 1.11581361, + "diversity_loss_mlp": 0.0, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.05898800907157556, + "language_loss": 0.83873129, + "learning_rate": 0.0007140503377003022, + "loss": 0.85001081, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 1967, + "time_per_iteration": 2.9807000160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123739, + "balance_loss_mlp": 1.11125755, + "diversity_loss_mlp": 0.0, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.06421364750503517, + "language_loss": 0.84625173, + "learning_rate": 0.000713768745708599, + "loss": 0.85748911, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1968, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_mlp": 1.10671234, + "diversity_loss_mlp": 0.0, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.06880095080762995, + "language_loss": 0.77052647, + "learning_rate": 0.0007134870707245085, + "loss": 0.78171611, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.12249756, + "routerloss_mlp": 0.0, + "step": 1969, + "time_per_iteration": 3.302985429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_mlp": 1.10852587, + "diversity_loss_mlp": 0.0, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.07142024228833302, + "language_loss": 0.84469545, + "learning_rate": 0.0007132053128573864, + "loss": 0.85590458, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.12384033, + "routerloss_mlp": 0.0, + "step": 1970, + "time_per_iteration": 2.7751197814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.11231327, + "diversity_loss_mlp": 0.0, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06795721743578591, + "language_loss": 0.83786452, + "learning_rate": 0.0007129234722166211, + "loss": 0.84910882, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 1971, + "time_per_iteration": 2.806898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114684, + "balance_loss_mlp": 1.10238707, + "diversity_loss_mlp": 0.0, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.06601167392952549, + "language_loss": 0.91087604, + "learning_rate": 0.0007126415489116328, + "loss": 0.92202282, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.1229248, + "routerloss_mlp": 0.0, + "step": 1972, + "time_per_iteration": 2.656651496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.09782279, + "diversity_loss_mlp": 0.0, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06641244535011205, + "language_loss": 0.81145501, + "learning_rate": 0.0007123595430518736, + "loss": 0.82255375, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 1973, + "time_per_iteration": 2.8665072917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_mlp": 1.09068835, + "diversity_loss_mlp": 0.0, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.07235703206146665, + "language_loss": 0.86411089, + "learning_rate": 0.0007120774547468282, + "loss": 0.87513655, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 1974, + "time_per_iteration": 2.5590381622314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948323, + "balance_loss_mlp": 1.65707994, + "diversity_loss_mlp": 0.20756721, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.03148003592885531, + "language_loss": 0.81558585, + "learning_rate": 0.0007117952841060128, + "loss": 0.82506907, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01599924, + "step": 1975, + "time_per_iteration": 2.6777563095092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_mlp": 1.07167053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.07660828670939425, + "language_loss": 0.83672053, + "learning_rate": 0.0007115130312389756, + "loss": 0.8475588, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 1976, + "time_per_iteration": 2.7103323936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07200503, + "diversity_loss_mlp": 0.0, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.08353002189035653, + "language_loss": 0.79290646, + "learning_rate": 0.0007112306962552973, + "loss": 0.80375111, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.12463379, + "routerloss_mlp": 0.0, + "step": 1977, + "time_per_iteration": 2.576239824295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_mlp": 1.07254314, + "diversity_loss_mlp": 0.0, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.06483406604645132, + "language_loss": 0.85315859, + "learning_rate": 0.0007109482792645896, + "loss": 0.86400628, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 1978, + "time_per_iteration": 2.7146143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.07276165, + "diversity_loss_mlp": 0.0, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.06865418790878511, + "language_loss": 0.83831733, + "learning_rate": 0.0007106657803764969, + "loss": 0.84916663, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 1979, + "time_per_iteration": 2.73152494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086146, + "balance_loss_mlp": 1.07395101, + "diversity_loss_mlp": 0.0, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07620298141647525, + "language_loss": 0.81962979, + "learning_rate": 0.0007103831997006948, + "loss": 0.83049119, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 1980, + "time_per_iteration": 2.7383615970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_mlp": 1.08276772, + "diversity_loss_mlp": 0.0, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.0842263164190672, + "language_loss": 0.85342598, + "learning_rate": 0.0007101005373468908, + "loss": 0.86437213, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 1981, + "time_per_iteration": 2.889251708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097444, + "balance_loss_mlp": 1.08543372, + "diversity_loss_mlp": 0.0, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.06048237516575629, + "language_loss": 0.86649287, + "learning_rate": 0.0007098177934248242, + "loss": 0.87746727, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 1982, + "time_per_iteration": 2.773146867752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920145, + "balance_loss_mlp": 1.60273147, + "diversity_loss_mlp": 0.20649332, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.033525346661278974, + "language_loss": 0.85516387, + "learning_rate": 0.0007095349680442661, + "loss": 0.86436534, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01553278, + "step": 1983, + "time_per_iteration": 2.8675785064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116298, + "balance_loss_mlp": 1.1045742, + "diversity_loss_mlp": 0.0, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.06407324010727367, + "language_loss": 0.78783178, + "learning_rate": 0.0007092520613150188, + "loss": 0.79899484, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 1984, + "time_per_iteration": 2.709177017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918651, + "balance_loss_mlp": 1.59999418, + "diversity_loss_mlp": 0.20665541, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.03070680845617011, + "language_loss": 0.80925471, + "learning_rate": 0.0007089690733469165, + "loss": 0.81844121, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532666, + "step": 1985, + "time_per_iteration": 2.750558376312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_mlp": 1.12384343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.08571071539105668, + "language_loss": 0.82313848, + "learning_rate": 0.000708686004249825, + "loss": 0.83449578, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 1986, + "time_per_iteration": 2.7550368309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.12012124, + "diversity_loss_mlp": 0.0, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07744479108461458, + "language_loss": 0.91340905, + "learning_rate": 0.0007084028541336413, + "loss": 0.92473006, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 1987, + "time_per_iteration": 2.703339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914957, + "balance_loss_mlp": 1.59260678, + "diversity_loss_mlp": 0.20690078, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.03035395776464378, + "language_loss": 0.86267084, + "learning_rate": 0.0007081196231082942, + "loss": 0.87182039, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01520337, + "step": 1988, + "time_per_iteration": 2.8075153827667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.10567343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.07746710731409655, + "language_loss": 0.80053389, + "learning_rate": 0.0007078363112837436, + "loss": 0.81171107, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.12036133, + "routerloss_mlp": 0.0, + "step": 1989, + "time_per_iteration": 2.811197280883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.09261441, + "diversity_loss_mlp": 0.0, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.07961201652041947, + "language_loss": 0.84721339, + "learning_rate": 0.000707552918769981, + "loss": 0.85826218, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 1990, + "time_per_iteration": 2.4908246994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_mlp": 1.08987188, + "diversity_loss_mlp": 0.0, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.06284554422997896, + "language_loss": 0.83619118, + "learning_rate": 0.000707269445677029, + "loss": 0.84721333, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.12341309, + "routerloss_mlp": 0.0, + "step": 1991, + "time_per_iteration": 2.733126401901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101588, + "balance_loss_mlp": 1.08921361, + "diversity_loss_mlp": 0.0, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07203164936975576, + "language_loss": 0.85140717, + "learning_rate": 0.0007069858921149416, + "loss": 0.86242306, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 1992, + "time_per_iteration": 2.9382007122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_mlp": 1.08434701, + "diversity_loss_mlp": 0.0, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.05485930037569587, + "language_loss": 0.85794246, + "learning_rate": 0.0007067022581938043, + "loss": 0.86891043, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.12457275, + "routerloss_mlp": 0.0, + "step": 1993, + "time_per_iteration": 2.857525110244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_mlp": 1.08321714, + "diversity_loss_mlp": 0.0, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.0871408980162776, + "language_loss": 0.83722532, + "learning_rate": 0.0007064185440237334, + "loss": 0.8481794, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1994, + "time_per_iteration": 2.7131123542785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099231, + "balance_loss_mlp": 1.08733368, + "diversity_loss_mlp": 0.0, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.06357294591464056, + "language_loss": 0.84358412, + "learning_rate": 0.0007061347497148764, + "loss": 0.85457647, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.11895752, + "routerloss_mlp": 0.0, + "step": 1995, + "time_per_iteration": 2.7398569583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_mlp": 1.09015, + "diversity_loss_mlp": 0.0, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.07322887134464046, + "language_loss": 0.86299884, + "learning_rate": 0.0007058508753774122, + "loss": 0.87402225, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1996, + "time_per_iteration": 2.6903162002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108867, + "balance_loss_mlp": 1.09709477, + "diversity_loss_mlp": 0.0, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.0698381422429368, + "language_loss": 0.86921895, + "learning_rate": 0.0007055669211215505, + "loss": 0.88030767, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 1997, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113141, + "balance_loss_mlp": 1.10084486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.08585182349688475, + "language_loss": 0.77776283, + "learning_rate": 0.0007052828870575322, + "loss": 0.78889418, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 1998, + "time_per_iteration": 2.685685873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_mlp": 1.11406291, + "diversity_loss_mlp": 0.0, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06979871165732322, + "language_loss": 0.87060714, + "learning_rate": 0.0007049987732956291, + "loss": 0.8818661, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.11834717, + "routerloss_mlp": 0.0, + "step": 1999, + "time_per_iteration": 2.9710631370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110647, + "balance_loss_mlp": 1.09428668, + "diversity_loss_mlp": 0.0, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05561177596637214, + "language_loss": 0.82812738, + "learning_rate": 0.0007047145799461439, + "loss": 0.83919203, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2000, + "time_per_iteration": 2.8492860794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_mlp": 1.09293747, + "diversity_loss_mlp": 0.0, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06017266002852966, + "language_loss": 0.82272708, + "learning_rate": 0.00070443030711941, + "loss": 0.83377922, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2001, + "time_per_iteration": 2.769383430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_mlp": 1.08806002, + "diversity_loss_mlp": 0.0, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.061888534691205976, + "language_loss": 0.82098496, + "learning_rate": 0.0007041459549257924, + "loss": 0.83198571, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2002, + "time_per_iteration": 2.876244306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089803, + "balance_loss_mlp": 1.07744145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.06816771124006925, + "language_loss": 0.78024125, + "learning_rate": 0.0007038615234756859, + "loss": 0.79113925, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.12359619, + "routerloss_mlp": 0.0, + "step": 2003, + "time_per_iteration": 3.1744768619537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_mlp": 1.07477546, + "diversity_loss_mlp": 0.0, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.09233530116269285, + "language_loss": 0.83808231, + "learning_rate": 0.000703577012879517, + "loss": 0.84895122, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2004, + "time_per_iteration": 2.633391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089004, + "balance_loss_mlp": 1.07705307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07105955558417659, + "language_loss": 0.88946962, + "learning_rate": 0.0007032924232477423, + "loss": 0.90035963, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2005, + "time_per_iteration": 2.6482574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109168, + "balance_loss_mlp": 1.0797528, + "diversity_loss_mlp": 0.0, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.07024694433071269, + "language_loss": 0.80605727, + "learning_rate": 0.0007030077546908493, + "loss": 0.81697416, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2006, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.08056581, + "diversity_loss_mlp": 0.0, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.032453276732354666, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84151709, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.06494141, + "routerloss_mlp": 0.0, + "step": 2007, + "time_per_iteration": 4.798014402389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099136, + "balance_loss_mlp": 1.08744717, + "diversity_loss_mlp": 0.0, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.08661380313869275, + "language_loss": 0.79137146, + "learning_rate": 0.0007024381812438117, + "loss": 0.8023628, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2008, + "time_per_iteration": 2.5403189659118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_mlp": 1.08864713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09407170185597404, + "language_loss": 0.83448064, + "learning_rate": 0.0007021532765747951, + "loss": 0.8454901, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2009, + "time_per_iteration": 2.9585187435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094144, + "balance_loss_mlp": 1.08211613, + "diversity_loss_mlp": 0.0, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.0684890586406507, + "language_loss": 0.79048979, + "learning_rate": 0.0007018682934229162, + "loss": 0.80143124, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2010, + "time_per_iteration": 2.9703307151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_mlp": 1.0842756, + "diversity_loss_mlp": 0.0, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06303649013837292, + "language_loss": 0.82761061, + "learning_rate": 0.0007015832318988152, + "loss": 0.83857542, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2011, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_mlp": 1.02231336, + "diversity_loss_mlp": 0.0, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.017766506591404385, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.7491802, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 2012, + "time_per_iteration": 4.938155651092529 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_mlp": 1.07810068, + "diversity_loss_mlp": 0.0, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.060967443696148906, + "language_loss": 0.84265292, + "learning_rate": 0.0007010128741766604, + "loss": 0.85356176, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 2013, + "time_per_iteration": 2.7293431758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091499, + "balance_loss_mlp": 1.07861209, + "diversity_loss_mlp": 0.0, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.07873148114105366, + "language_loss": 0.84277219, + "learning_rate": 0.0007007275782000391, + "loss": 0.85368717, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 2014, + "time_per_iteration": 2.644911766052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091625, + "balance_loss_mlp": 1.07889354, + "diversity_loss_mlp": 0.0, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.0868083489465314, + "language_loss": 0.8502394, + "learning_rate": 0.0007004422042940605, + "loss": 0.86115563, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2015, + "time_per_iteration": 2.5096747875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109894, + "balance_loss_mlp": 1.08593392, + "diversity_loss_mlp": 0.0, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08227522563153689, + "language_loss": 0.89877218, + "learning_rate": 0.0007001567525695169, + "loss": 0.90976155, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 2016, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06437704205290017, + "language_loss": 0.83705699, + "learning_rate": 0.0006998712231372303, + "loss": 0.84811676, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2017, + "time_per_iteration": 3.016061305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119415, + "balance_loss_mlp": 1.10692167, + "diversity_loss_mlp": 0.0, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06622760195410109, + "language_loss": 0.85886908, + "learning_rate": 0.0006995856161080532, + "loss": 0.87006325, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.12487793, + "routerloss_mlp": 0.0, + "step": 2018, + "time_per_iteration": 2.8263893127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_mlp": 1.11165869, + "diversity_loss_mlp": 0.0, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.06957079313074316, + "language_loss": 0.82328916, + "learning_rate": 0.0006992999315928679, + "loss": 0.83453172, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.1260376, + "routerloss_mlp": 0.0, + "step": 2019, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.11772799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.05589846380959986, + "language_loss": 0.85480869, + "learning_rate": 0.0006990141697025871, + "loss": 0.86611497, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 2020, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067569, + "balance_loss_mlp": 1.06141829, + "diversity_loss_mlp": 0.0, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.034323999481440985, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77427208, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2021, + "time_per_iteration": 4.782108545303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130924, + "balance_loss_mlp": 1.11879468, + "diversity_loss_mlp": 0.0, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0813348018947899, + "language_loss": 0.82333553, + "learning_rate": 0.0006984424142405392, + "loss": 0.83464473, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2022, + "time_per_iteration": 2.804866075515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.10578668, + "diversity_loss_mlp": 0.0, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.07379903296161248, + "language_loss": 0.82117045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83235097, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2023, + "time_per_iteration": 2.5883662700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_mlp": 1.11855519, + "diversity_loss_mlp": 0.0, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.07869766022149485, + "language_loss": 0.8995713, + "learning_rate": 0.0006978703506098102, + "loss": 0.91087961, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2024, + "time_per_iteration": 2.730283498764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127449, + "balance_loss_mlp": 1.11556411, + "diversity_loss_mlp": 0.0, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.0665173530375796, + "language_loss": 0.88210815, + "learning_rate": 0.00069758420350879, + "loss": 0.89338267, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2025, + "time_per_iteration": 2.62969708442688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932402, + "balance_loss_mlp": 1.62686133, + "diversity_loss_mlp": 0.20693868, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.03379762859523427, + "language_loss": 0.8613863, + "learning_rate": 0.000697297979698779, + "loss": 0.87071025, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01550185, + "step": 2026, + "time_per_iteration": 2.837543249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_mlp": 1.09529877, + "diversity_loss_mlp": 0.0, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06049708379655892, + "language_loss": 0.83660531, + "learning_rate": 0.0006970116792908992, + "loss": 0.84767604, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2027, + "time_per_iteration": 3.1133604049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_mlp": 1.0960542, + "diversity_loss_mlp": 0.0, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.07190738956644391, + "language_loss": 0.81380564, + "learning_rate": 0.000696725302396302, + "loss": 0.82488191, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2028, + "time_per_iteration": 2.6460230350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109766, + "balance_loss_mlp": 1.08604932, + "diversity_loss_mlp": 0.0, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.06814290150602269, + "language_loss": 0.85887402, + "learning_rate": 0.0006964388491261692, + "loss": 0.86985064, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2029, + "time_per_iteration": 3.296208143234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099195, + "balance_loss_mlp": 1.0871129, + "diversity_loss_mlp": 0.0, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.075812953715104, + "language_loss": 0.87511015, + "learning_rate": 0.0006961523195917114, + "loss": 0.88610214, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2030, + "time_per_iteration": 2.803239345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_mlp": 1.09573865, + "diversity_loss_mlp": 0.0, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.0665807006884719, + "language_loss": 0.78137511, + "learning_rate": 0.0006958657139041696, + "loss": 0.79245031, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2031, + "time_per_iteration": 2.739151954650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_mlp": 1.05531955, + "diversity_loss_mlp": 0.0, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.035996309550900246, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77773988, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.05688477, + "routerloss_mlp": 0.0, + "step": 2032, + "time_per_iteration": 4.918209552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_mlp": 1.08307993, + "diversity_loss_mlp": 0.0, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.0751880944680772, + "language_loss": 0.78643966, + "learning_rate": 0.0006952922745149434, + "loss": 0.79738843, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2033, + "time_per_iteration": 2.6274161338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091358, + "balance_loss_mlp": 1.07940745, + "diversity_loss_mlp": 0.0, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.07391479182011068, + "language_loss": 0.87674987, + "learning_rate": 0.000695005441035888, + "loss": 0.88766348, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2034, + "time_per_iteration": 2.647348642349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018577, + "balance_loss_mlp": 1.01280713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.010435626825017296, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74742007, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2035, + "time_per_iteration": 4.8861188888549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107172, + "balance_loss_mlp": 1.094733, + "diversity_loss_mlp": 0.0, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06114898183694146, + "language_loss": 0.81133932, + "learning_rate": 0.0006944315470656863, + "loss": 0.82241106, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.12438965, + "routerloss_mlp": 0.0, + "step": 2036, + "time_per_iteration": 3.0057246685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_mlp": 1.09606266, + "diversity_loss_mlp": 0.0, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.0812142536963638, + "language_loss": 0.90953541, + "learning_rate": 0.000694144486797345, + "loss": 0.92062169, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2037, + "time_per_iteration": 2.6566872596740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_mlp": 1.0060699, + "diversity_loss_mlp": 0.0, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.012879447335335118, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80532491, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2038, + "time_per_iteration": 4.609802722930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.09141517, + "diversity_loss_mlp": 0.0, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.07718413790316761, + "language_loss": 0.89271998, + "learning_rate": 0.0006935701402514156, + "loss": 0.90375727, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2039, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101805, + "balance_loss_mlp": 1.01206541, + "diversity_loss_mlp": 0.0, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.016017309503016164, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74052942, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2040, + "time_per_iteration": 4.954579830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_mlp": 1.09434199, + "diversity_loss_mlp": 0.0, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.0728619475730698, + "language_loss": 0.84539711, + "learning_rate": 0.0006929954931031422, + "loss": 0.85646391, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2041, + "time_per_iteration": 3.6979990005493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114297, + "balance_loss_mlp": 1.10201287, + "diversity_loss_mlp": 0.0, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.07303574322286652, + "language_loss": 0.88330269, + "learning_rate": 0.0006927080570819805, + "loss": 0.89444566, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2042, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_mlp": 1.11437607, + "diversity_loss_mlp": 0.0, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09784101638347129, + "language_loss": 0.80726093, + "learning_rate": 0.0006924205462449161, + "loss": 0.81852722, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2043, + "time_per_iteration": 2.556964159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123407, + "balance_loss_mlp": 1.11139631, + "diversity_loss_mlp": 0.0, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07674510212981295, + "language_loss": 0.81822228, + "learning_rate": 0.0006921329607035702, + "loss": 0.82945639, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.12005615, + "routerloss_mlp": 0.0, + "step": 2044, + "time_per_iteration": 3.2355051040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_mlp": 1.09777582, + "diversity_loss_mlp": 0.0, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0626655505852987, + "language_loss": 0.87889385, + "learning_rate": 0.0006918453005695938, + "loss": 0.88998848, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2045, + "time_per_iteration": 2.616405725479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112555, + "balance_loss_mlp": 1.10047281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0593607382511463, + "language_loss": 0.8430419, + "learning_rate": 0.0006915575659546662, + "loss": 0.85416746, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2046, + "time_per_iteration": 2.6596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_mlp": 1.08785915, + "diversity_loss_mlp": 0.0, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0680979304239865, + "language_loss": 0.80745959, + "learning_rate": 0.0006912697569704959, + "loss": 0.81846058, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2047, + "time_per_iteration": 2.5962154865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097855, + "balance_loss_mlp": 1.08564174, + "diversity_loss_mlp": 0.0, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07634449995136075, + "language_loss": 0.8702817, + "learning_rate": 0.0006909818737288205, + "loss": 0.88126016, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.12207031, + "routerloss_mlp": 0.0, + "step": 2048, + "time_per_iteration": 2.5559332370758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111282, + "balance_loss_mlp": 1.09955215, + "diversity_loss_mlp": 0.0, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07451514550279957, + "language_loss": 0.80715293, + "learning_rate": 0.000690693916341406, + "loss": 0.81826574, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2049, + "time_per_iteration": 2.605881690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_mlp": 1.10377121, + "diversity_loss_mlp": 0.0, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.06516266173427393, + "language_loss": 0.82286257, + "learning_rate": 0.0006904058849200475, + "loss": 0.83401763, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2050, + "time_per_iteration": 2.7183115482330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.09360313, + "diversity_loss_mlp": 0.0, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.0753850450331705, + "language_loss": 0.84972727, + "learning_rate": 0.0006901177795765683, + "loss": 0.8607837, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2051, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105318, + "balance_loss_mlp": 1.09354019, + "diversity_loss_mlp": 0.0, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.06465732667856934, + "language_loss": 0.81096435, + "learning_rate": 0.0006898296004228213, + "loss": 0.82201755, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2052, + "time_per_iteration": 2.7607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050397, + "balance_loss_mlp": 1.04446077, + "diversity_loss_mlp": 0.0, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03031396698302257, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177135, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.05932617, + "routerloss_mlp": 0.0, + "step": 2053, + "time_per_iteration": 4.876460552215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_mlp": 1.10529494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.1105412420488248, + "language_loss": 0.79620701, + "learning_rate": 0.0006892530211320763, + "loss": 0.80737776, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2054, + "time_per_iteration": 2.702591896057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00944261, + "balance_loss_mlp": 1.6481061, + "diversity_loss_mlp": 0.21043469, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.03587460904718008, + "language_loss": 0.84313488, + "learning_rate": 0.000688964621218926, + "loss": 0.85257751, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01499031, + "step": 2055, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109262, + "balance_loss_mlp": 1.08063984, + "diversity_loss_mlp": 0.0, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.0862390851468888, + "language_loss": 0.80478442, + "learning_rate": 0.0006886761479432037, + "loss": 0.81571066, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2056, + "time_per_iteration": 2.8577234745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_mlp": 1.06739902, + "diversity_loss_mlp": 0.0, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.06874544900142358, + "language_loss": 0.84387571, + "learning_rate": 0.0006883876014169045, + "loss": 0.85467529, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2057, + "time_per_iteration": 2.572458505630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073975, + "balance_loss_mlp": 1.06154716, + "diversity_loss_mlp": 0.0, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07681071569739906, + "language_loss": 0.90056652, + "learning_rate": 0.000688098981752052, + "loss": 0.91130626, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 2058, + "time_per_iteration": 2.7125563621520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080003, + "balance_loss_mlp": 1.06697917, + "diversity_loss_mlp": 0.0, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08571295812058347, + "language_loss": 0.80176479, + "learning_rate": 0.0006878102890606982, + "loss": 0.81256485, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 2059, + "time_per_iteration": 3.0797197818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108467, + "balance_loss_mlp": 1.07161617, + "diversity_loss_mlp": 0.0, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08415103615204221, + "language_loss": 0.81576395, + "learning_rate": 0.0006875215234549239, + "loss": 0.82661068, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 2060, + "time_per_iteration": 2.5358171463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_mlp": 1.06604218, + "diversity_loss_mlp": 0.0, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08360675720274492, + "language_loss": 0.85212821, + "learning_rate": 0.0006872326850468376, + "loss": 0.86291778, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 2061, + "time_per_iteration": 2.685746669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_mlp": 1.06612396, + "diversity_loss_mlp": 0.0, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08669948408116639, + "language_loss": 0.78834969, + "learning_rate": 0.0006869437739485762, + "loss": 0.79913992, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.12908936, + "routerloss_mlp": 0.0, + "step": 2062, + "time_per_iteration": 2.608938455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085084, + "balance_loss_mlp": 1.07266808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06314890183319057, + "language_loss": 0.92750764, + "learning_rate": 0.0006866547902723053, + "loss": 0.93835843, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.12420654, + "routerloss_mlp": 0.0, + "step": 2063, + "time_per_iteration": 2.654764175415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07135844, + "diversity_loss_mlp": 0.0, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10797740353372913, + "language_loss": 0.80444092, + "learning_rate": 0.000686365734130218, + "loss": 0.81527805, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2064, + "time_per_iteration": 2.7161076068878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_mlp": 1.07345843, + "diversity_loss_mlp": 0.0, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06605501724079509, + "language_loss": 0.83883071, + "learning_rate": 0.000686076605634536, + "loss": 0.84968603, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2065, + "time_per_iteration": 2.5960052013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088286, + "balance_loss_mlp": 1.07656133, + "diversity_loss_mlp": 0.0, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.06893141882644385, + "language_loss": 0.84303313, + "learning_rate": 0.0006857874048975088, + "loss": 0.85391599, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2066, + "time_per_iteration": 2.5419557094573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.08599246, + "diversity_loss_mlp": 0.0, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.07076940729430262, + "language_loss": 0.86944497, + "learning_rate": 0.0006854981320314142, + "loss": 0.88042831, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2067, + "time_per_iteration": 2.4425127506256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101956, + "balance_loss_mlp": 1.0900414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08678893766230582, + "language_loss": 0.86775517, + "learning_rate": 0.0006852087871485579, + "loss": 0.87877476, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2068, + "time_per_iteration": 2.617234468460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_mlp": 1.09308147, + "diversity_loss_mlp": 0.0, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08540761893483814, + "language_loss": 0.81805646, + "learning_rate": 0.0006849193703612735, + "loss": 0.82910275, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2069, + "time_per_iteration": 2.7818312644958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.08808875, + "diversity_loss_mlp": 0.0, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.06305964525737012, + "language_loss": 0.77731991, + "learning_rate": 0.0006846298817819225, + "loss": 0.78832221, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2070, + "time_per_iteration": 2.970045328140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_mlp": 1.08777106, + "diversity_loss_mlp": 0.0, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.09229213766989015, + "language_loss": 0.81058359, + "learning_rate": 0.0006843403215228945, + "loss": 0.82158017, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2071, + "time_per_iteration": 2.47542405128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.08525538, + "diversity_loss_mlp": 0.0, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.06250612449775428, + "language_loss": 0.80665851, + "learning_rate": 0.0006840506896966065, + "loss": 0.81763273, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2072, + "time_per_iteration": 2.7048730850219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_mlp": 1.09000397, + "diversity_loss_mlp": 0.0, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07670911788950584, + "language_loss": 0.82343054, + "learning_rate": 0.0006837609864155038, + "loss": 0.83445203, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2073, + "time_per_iteration": 2.940208673477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_mlp": 1.09976768, + "diversity_loss_mlp": 0.0, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.06443735331096001, + "language_loss": 0.83203363, + "learning_rate": 0.0006834712117920592, + "loss": 0.84314907, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2074, + "time_per_iteration": 2.6217153072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_mlp": 1.09892166, + "diversity_loss_mlp": 0.0, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07401760730887977, + "language_loss": 0.85670066, + "learning_rate": 0.0006831813659387729, + "loss": 0.86781245, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2075, + "time_per_iteration": 2.5696237087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109855, + "balance_loss_mlp": 1.09774292, + "diversity_loss_mlp": 0.0, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.05990934262108594, + "language_loss": 0.84167391, + "learning_rate": 0.0006828914489681733, + "loss": 0.85277247, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2076, + "time_per_iteration": 2.7859339714050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119025, + "balance_loss_mlp": 1.1072948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.06517456650976074, + "language_loss": 0.85312855, + "learning_rate": 0.0006826014609928162, + "loss": 0.86431879, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2077, + "time_per_iteration": 2.6851699352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0060157, + "balance_loss_mlp": 1.02597332, + "diversity_loss_mlp": 0.1552759, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.0013651319096223075, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8380096, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094547, + "step": 2078, + "time_per_iteration": 4.859188795089722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.1030947, + "diversity_loss_mlp": 0.0, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.0748648316539235, + "language_loss": 0.80062771, + "learning_rate": 0.0006820212724781896, + "loss": 0.81177354, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.11486816, + "routerloss_mlp": 0.0, + "step": 2079, + "time_per_iteration": 2.6628189086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.09492946, + "diversity_loss_mlp": 0.0, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06148312623903997, + "language_loss": 0.83733618, + "learning_rate": 0.0006817310721641694, + "loss": 0.84840119, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2080, + "time_per_iteration": 2.847182512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119227, + "balance_loss_mlp": 1.10731816, + "diversity_loss_mlp": 0.0, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.07223167054032475, + "language_loss": 0.83566946, + "learning_rate": 0.00068144080129589, + "loss": 0.84686172, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2081, + "time_per_iteration": 2.7161402702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.10388541, + "diversity_loss_mlp": 0.0, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.07619573858560975, + "language_loss": 0.8280167, + "learning_rate": 0.0006811504599860441, + "loss": 0.83917284, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2082, + "time_per_iteration": 2.5584774017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104984, + "balance_loss_mlp": 1.0928719, + "diversity_loss_mlp": 0.0, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.1306421138400452, + "language_loss": 0.8569895, + "learning_rate": 0.0006808600483473526, + "loss": 0.86803931, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2083, + "time_per_iteration": 2.864786148071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094797, + "balance_loss_mlp": 1.0824883, + "diversity_loss_mlp": 0.0, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.06339794743033755, + "language_loss": 0.86393988, + "learning_rate": 0.0006805695664925629, + "loss": 0.87488782, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2084, + "time_per_iteration": 2.844709634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089155, + "balance_loss_mlp": 1.07735372, + "diversity_loss_mlp": 0.0, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.0888076684038974, + "language_loss": 0.83841193, + "learning_rate": 0.0006802790145344506, + "loss": 0.84930348, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2085, + "time_per_iteration": 2.4883856773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083386, + "balance_loss_mlp": 1.07145894, + "diversity_loss_mlp": 0.0, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07803386161895243, + "language_loss": 0.87420845, + "learning_rate": 0.0006799883925858176, + "loss": 0.88504231, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2086, + "time_per_iteration": 2.8824286460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088787, + "balance_loss_mlp": 1.0766871, + "diversity_loss_mlp": 0.0, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06924310288687491, + "language_loss": 0.85459089, + "learning_rate": 0.0006796977007594933, + "loss": 0.86547881, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2087, + "time_per_iteration": 2.6597371101379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970559, + "balance_loss_mlp": 1.6983223, + "diversity_loss_mlp": 0.21244028, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.03280700890509502, + "language_loss": 0.86715519, + "learning_rate": 0.0006794069391683345, + "loss": 0.87686074, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01517779, + "step": 2088, + "time_per_iteration": 2.7649624347686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078316, + "balance_loss_mlp": 1.06610286, + "diversity_loss_mlp": 0.0, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07764554073270104, + "language_loss": 0.80781567, + "learning_rate": 0.0006791161079252248, + "loss": 0.81859887, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2089, + "time_per_iteration": 2.6467885971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082706, + "balance_loss_mlp": 1.07014716, + "diversity_loss_mlp": 0.0, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.0935978018434956, + "language_loss": 0.82482743, + "learning_rate": 0.0006788252071430747, + "loss": 0.8356545, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.12561035, + "routerloss_mlp": 0.0, + "step": 2090, + "time_per_iteration": 2.684659242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076621, + "balance_loss_mlp": 1.06417561, + "diversity_loss_mlp": 0.0, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.061003649340911806, + "language_loss": 0.86884034, + "learning_rate": 0.0006785342369348222, + "loss": 0.87960654, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 2091, + "time_per_iteration": 2.7500762939453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081239, + "balance_loss_mlp": 1.06896663, + "diversity_loss_mlp": 0.0, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08323404973511926, + "language_loss": 0.79681003, + "learning_rate": 0.0006782431974134316, + "loss": 0.80762231, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2092, + "time_per_iteration": 2.554500102996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.07266974, + "diversity_loss_mlp": 0.0, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.06323665884579813, + "language_loss": 0.89339125, + "learning_rate": 0.0006779520886918949, + "loss": 0.90424317, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2093, + "time_per_iteration": 3.0625791549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109256, + "balance_loss_mlp": 1.08038247, + "diversity_loss_mlp": 0.0, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.06591278584355922, + "language_loss": 0.81594688, + "learning_rate": 0.0006776609108832301, + "loss": 0.82687247, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2094, + "time_per_iteration": 2.84006929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099273, + "balance_loss_mlp": 1.08723903, + "diversity_loss_mlp": 0.0, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.07397134749055344, + "language_loss": 0.84911013, + "learning_rate": 0.0006773696641004828, + "loss": 0.86010277, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2095, + "time_per_iteration": 2.5662059783935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110289, + "balance_loss_mlp": 1.09781969, + "diversity_loss_mlp": 0.0, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.07471072764212172, + "language_loss": 0.77422667, + "learning_rate": 0.0006770783484567247, + "loss": 0.78532958, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2096, + "time_per_iteration": 3.120000123977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106409, + "balance_loss_mlp": 1.09445786, + "diversity_loss_mlp": 0.0, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.05645154934481913, + "language_loss": 0.85885596, + "learning_rate": 0.000676786964065055, + "loss": 0.86992002, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2097, + "time_per_iteration": 2.7947449684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09767413, + "diversity_loss_mlp": 0.0, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.06468702094514471, + "language_loss": 0.78823644, + "learning_rate": 0.0006764955110385986, + "loss": 0.7993331, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2098, + "time_per_iteration": 2.7805027961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113539, + "balance_loss_mlp": 1.10162365, + "diversity_loss_mlp": 0.0, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06520165677387538, + "language_loss": 0.80479109, + "learning_rate": 0.0006762039894905083, + "loss": 0.81592649, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2099, + "time_per_iteration": 2.5934462547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113864, + "balance_loss_mlp": 1.10191941, + "diversity_loss_mlp": 0.0, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.07619139256642768, + "language_loss": 0.80502266, + "learning_rate": 0.000675912399533962, + "loss": 0.81616127, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2100, + "time_per_iteration": 2.5193917751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0095878, + "balance_loss_mlp": 1.67460704, + "diversity_loss_mlp": 0.21229821, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.026749352452392162, + "language_loss": 0.8501215, + "learning_rate": 0.0006756207412821656, + "loss": 0.85970926, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532745, + "step": 2101, + "time_per_iteration": 3.0674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125351, + "balance_loss_mlp": 1.11366224, + "diversity_loss_mlp": 0.0, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.07971707112625441, + "language_loss": 0.80680853, + "learning_rate": 0.0006753290148483505, + "loss": 0.81806201, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2102, + "time_per_iteration": 3.0177412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128591, + "balance_loss_mlp": 1.11720061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07197972569419236, + "language_loss": 0.78862077, + "learning_rate": 0.0006750372203457752, + "loss": 0.79990667, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2103, + "time_per_iteration": 2.4715232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133472, + "balance_loss_mlp": 1.12199795, + "diversity_loss_mlp": 0.0, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.05679089538273026, + "language_loss": 0.8629868, + "learning_rate": 0.0006747453578877242, + "loss": 0.87432158, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2104, + "time_per_iteration": 2.7127907276153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133404, + "balance_loss_mlp": 1.12154305, + "diversity_loss_mlp": 0.0, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.07881786572134404, + "language_loss": 0.83325595, + "learning_rate": 0.0006744534275875085, + "loss": 0.84459001, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2105, + "time_per_iteration": 2.9968934059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_mlp": 1.11278331, + "diversity_loss_mlp": 0.0, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.06959652480101333, + "language_loss": 0.85228348, + "learning_rate": 0.0006741614295584657, + "loss": 0.86352497, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2106, + "time_per_iteration": 2.6837310791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128388, + "balance_loss_mlp": 1.1166873, + "diversity_loss_mlp": 0.0, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.07271017039443997, + "language_loss": 0.78820735, + "learning_rate": 0.0006738693639139595, + "loss": 0.79949123, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2107, + "time_per_iteration": 2.9876344203948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.09982085, + "diversity_loss_mlp": 0.0, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.07545270814647756, + "language_loss": 0.7770499, + "learning_rate": 0.0006735772307673796, + "loss": 0.78816462, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2108, + "time_per_iteration": 3.5391368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.1007216, + "diversity_loss_mlp": 0.0, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.07028810729839409, + "language_loss": 0.8317976, + "learning_rate": 0.0006732850302321421, + "loss": 0.84292281, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2109, + "time_per_iteration": 2.924703359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107897, + "balance_loss_mlp": 1.0962801, + "diversity_loss_mlp": 0.0, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.08331494403878895, + "language_loss": 0.84220135, + "learning_rate": 0.00067299276242169, + "loss": 0.85328031, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2110, + "time_per_iteration": 2.6628758907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00591895, + "balance_loss_mlp": 1.01285744, + "diversity_loss_mlp": 0.15005666, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.0011574932258311419, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.74974066, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01043818, + "step": 2111, + "time_per_iteration": 4.913798093795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100112, + "balance_loss_mlp": 1.0884769, + "diversity_loss_mlp": 0.0, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.0671840972805921, + "language_loss": 0.77974957, + "learning_rate": 0.0006724080254290395, + "loss": 0.79075068, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2112, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087298, + "balance_loss_mlp": 1.07509685, + "diversity_loss_mlp": 0.0, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.06921545909042545, + "language_loss": 0.89956391, + "learning_rate": 0.0006721155564738566, + "loss": 0.91043687, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2113, + "time_per_iteration": 2.654052495956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590146, + "balance_loss_mlp": 1.01069736, + "diversity_loss_mlp": 0.14874323, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.001129022163549877, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79212785, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01042587, + "step": 2114, + "time_per_iteration": 5.02890682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095601, + "balance_loss_mlp": 1.08348942, + "diversity_loss_mlp": 0.0, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.06673632265299649, + "language_loss": 0.85678279, + "learning_rate": 0.0006715304182135078, + "loss": 0.86773884, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2115, + "time_per_iteration": 2.6665151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.07951176, + "diversity_loss_mlp": 0.0, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.08902530655488881, + "language_loss": 0.8859638, + "learning_rate": 0.0006712377491355127, + "loss": 0.89688623, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 2116, + "time_per_iteration": 2.9124083518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091485, + "balance_loss_mlp": 1.07896256, + "diversity_loss_mlp": 0.0, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.06275972542298792, + "language_loss": 0.81009984, + "learning_rate": 0.0006709450135771274, + "loss": 0.8210147, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2117, + "time_per_iteration": 2.9538469314575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109045, + "balance_loss_mlp": 1.07800436, + "diversity_loss_mlp": 0.0, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.06731197780562713, + "language_loss": 0.8655895, + "learning_rate": 0.0006706522116520023, + "loss": 0.87649393, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2118, + "time_per_iteration": 2.6403684616088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.08127189, + "diversity_loss_mlp": 0.0, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.07339707473672348, + "language_loss": 0.82936597, + "learning_rate": 0.0006703593434738127, + "loss": 0.84030455, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2119, + "time_per_iteration": 2.706406354904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_mlp": 1.0847466, + "diversity_loss_mlp": 0.0, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.05750096894007485, + "language_loss": 0.78123623, + "learning_rate": 0.0006700664091562604, + "loss": 0.79220533, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2120, + "time_per_iteration": 2.5515992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102687, + "balance_loss_mlp": 1.09045601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.08484846499370094, + "language_loss": 0.85241771, + "learning_rate": 0.0006697734088130725, + "loss": 0.86344457, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2121, + "time_per_iteration": 2.5997116565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094995, + "balance_loss_mlp": 1.08268619, + "diversity_loss_mlp": 0.0, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.06901349076849703, + "language_loss": 0.85628182, + "learning_rate": 0.0006694803425580018, + "loss": 0.86723173, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2122, + "time_per_iteration": 2.975572109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090381, + "balance_loss_mlp": 1.07825708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08123936309079019, + "language_loss": 0.84420574, + "learning_rate": 0.0006691872105048268, + "loss": 0.85510951, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2123, + "time_per_iteration": 2.5785253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109138, + "balance_loss_mlp": 1.07879114, + "diversity_loss_mlp": 0.0, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.06700388653835253, + "language_loss": 0.84703517, + "learning_rate": 0.0006688940127673513, + "loss": 0.85794896, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 2124, + "time_per_iteration": 2.794312000274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080411, + "balance_loss_mlp": 1.06789398, + "diversity_loss_mlp": 0.0, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.11477925500015464, + "language_loss": 0.85646629, + "learning_rate": 0.0006686007494594049, + "loss": 0.86727041, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2125, + "time_per_iteration": 2.8629977703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.06869102, + "diversity_loss_mlp": 0.0, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.08770785423003769, + "language_loss": 0.80226219, + "learning_rate": 0.0006683074206948425, + "loss": 0.81306815, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2126, + "time_per_iteration": 2.5477960109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.06884146, + "diversity_loss_mlp": 0.0, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.0688791895715759, + "language_loss": 0.81257784, + "learning_rate": 0.0006680140265875443, + "loss": 0.82338405, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2127, + "time_per_iteration": 2.824706792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.06504989, + "diversity_loss_mlp": 0.0, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.0706270365820259, + "language_loss": 0.95744675, + "learning_rate": 0.0006677205672514162, + "loss": 0.96821618, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2128, + "time_per_iteration": 2.6173171997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081253, + "balance_loss_mlp": 1.06944525, + "diversity_loss_mlp": 0.0, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.08385407721227026, + "language_loss": 0.88751161, + "learning_rate": 0.000667427042800389, + "loss": 0.89832413, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2129, + "time_per_iteration": 2.746561288833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090066, + "balance_loss_mlp": 1.07828188, + "diversity_loss_mlp": 0.0, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.0802302808929841, + "language_loss": 0.82728851, + "learning_rate": 0.0006671334533484192, + "loss": 0.83818918, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2130, + "time_per_iteration": 2.7765390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094937, + "balance_loss_mlp": 1.08306408, + "diversity_loss_mlp": 0.0, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.06494454218377498, + "language_loss": 0.83394802, + "learning_rate": 0.0006668397990094881, + "loss": 0.84489739, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2131, + "time_per_iteration": 2.6814444065093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094306, + "balance_loss_mlp": 1.08240891, + "diversity_loss_mlp": 0.0, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08851492372685672, + "language_loss": 0.84863144, + "learning_rate": 0.0006665460798976027, + "loss": 0.8595745, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2132, + "time_per_iteration": 2.734208822250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.08680749, + "diversity_loss_mlp": 0.0, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.07834997970618658, + "language_loss": 0.8153789, + "learning_rate": 0.0006662522961267947, + "loss": 0.82635975, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2133, + "time_per_iteration": 2.642789363861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_mlp": 1.0889008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.06175420460070233, + "language_loss": 0.87238759, + "learning_rate": 0.0006659584478111211, + "loss": 0.88339174, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2134, + "time_per_iteration": 2.8097283840179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110764, + "balance_loss_mlp": 1.09618366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.07261990262121029, + "language_loss": 0.82762325, + "learning_rate": 0.000665664535064664, + "loss": 0.83869964, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2135, + "time_per_iteration": 3.034973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118337, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07277612177905571, + "language_loss": 0.82753229, + "learning_rate": 0.0006653705580015303, + "loss": 0.83871567, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2136, + "time_per_iteration": 2.719024181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130623, + "balance_loss_mlp": 1.11913705, + "diversity_loss_mlp": 0.0, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.09561286081072368, + "language_loss": 0.86333638, + "learning_rate": 0.0006650765167358523, + "loss": 0.87464261, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2137, + "time_per_iteration": 2.798013210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119744, + "balance_loss_mlp": 1.10816908, + "diversity_loss_mlp": 0.0, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06575385598885217, + "language_loss": 0.90120316, + "learning_rate": 0.0006647824113817864, + "loss": 0.9124006, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2138, + "time_per_iteration": 2.5290029048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00941862, + "balance_loss_mlp": 1.64172852, + "diversity_loss_mlp": 0.21382158, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.027199696320483784, + "language_loss": 0.81782889, + "learning_rate": 0.000664488242053515, + "loss": 0.8272475, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01408678, + "step": 2139, + "time_per_iteration": 2.7610864639282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_mlp": 1.1009748, + "diversity_loss_mlp": 0.0, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.07795493316399416, + "language_loss": 0.83879304, + "learning_rate": 0.0006641940088652445, + "loss": 0.84992164, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2140, + "time_per_iteration": 2.7797446250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.08682573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09321248474614077, + "language_loss": 0.82214057, + "learning_rate": 0.0006638997119312065, + "loss": 0.83312857, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2141, + "time_per_iteration": 2.688427209854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_mlp": 1.07580638, + "diversity_loss_mlp": 0.0, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.05051376163622262, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76146024, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2142, + "time_per_iteration": 4.916438817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_mlp": 1.07186329, + "diversity_loss_mlp": 0.0, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.0666522569579182, + "language_loss": 0.84487629, + "learning_rate": 0.000663310927282877, + "loss": 0.85571963, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2143, + "time_per_iteration": 2.742781162261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.06302905, + "diversity_loss_mlp": 0.0, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07553146792883669, + "language_loss": 0.85816187, + "learning_rate": 0.000663016439797172, + "loss": 0.86891896, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2144, + "time_per_iteration": 2.602322578430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.06363273, + "diversity_loss_mlp": 0.0, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.09188682549299809, + "language_loss": 0.80924189, + "learning_rate": 0.0006627218890228724, + "loss": 0.82000041, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2145, + "time_per_iteration": 2.76790452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081048, + "balance_loss_mlp": 1.0687809, + "diversity_loss_mlp": 0.0, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.09235653357512275, + "language_loss": 0.83860421, + "learning_rate": 0.0006624272750743326, + "loss": 0.84941471, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2146, + "time_per_iteration": 2.986267566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085622, + "balance_loss_mlp": 1.073385, + "diversity_loss_mlp": 0.0, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06221373460159241, + "language_loss": 0.82866907, + "learning_rate": 0.0006621325980659322, + "loss": 0.83952528, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2147, + "time_per_iteration": 2.78074049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091906, + "balance_loss_mlp": 1.07981253, + "diversity_loss_mlp": 0.0, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06655163113776748, + "language_loss": 0.81613219, + "learning_rate": 0.000661837858112075, + "loss": 0.82705128, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2148, + "time_per_iteration": 2.8118457794189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920817, + "balance_loss_mlp": 1.59947157, + "diversity_loss_mlp": 0.21162269, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.03430222900415099, + "language_loss": 0.88696158, + "learning_rate": 0.0006615430553271888, + "loss": 0.89616972, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01526995, + "step": 2149, + "time_per_iteration": 2.809389352798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10438299, + "diversity_loss_mlp": 0.0, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06824786639125466, + "language_loss": 0.85333586, + "learning_rate": 0.0006612481898257264, + "loss": 0.8644954, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2150, + "time_per_iteration": 2.855074644088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137757, + "balance_loss_mlp": 1.12599659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.07789693292988349, + "language_loss": 0.851385, + "learning_rate": 0.000660953261722165, + "loss": 0.86276257, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.11749268, + "routerloss_mlp": 0.0, + "step": 2151, + "time_per_iteration": 2.5938022136688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113704, + "balance_loss_mlp": 1.12522054, + "diversity_loss_mlp": 0.0, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.08228338378299185, + "language_loss": 0.82884097, + "learning_rate": 0.0006606582711310055, + "loss": 0.84021133, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2152, + "time_per_iteration": 2.7282497882843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145368, + "balance_loss_mlp": 1.13366747, + "diversity_loss_mlp": 0.0, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.06559194318793425, + "language_loss": 0.82812124, + "learning_rate": 0.0006603632181667736, + "loss": 0.83957493, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2153, + "time_per_iteration": 2.6664750576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_mlp": 1.09754133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03767833543400207, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8004716, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 2154, + "time_per_iteration": 4.910309791564941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_mlp": 1.12367392, + "diversity_loss_mlp": 0.0, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.0807614788835298, + "language_loss": 0.81897664, + "learning_rate": 0.0006597729255773153, + "loss": 0.83032906, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2155, + "time_per_iteration": 2.509021520614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.13441765, + "diversity_loss_mlp": 0.0, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.07993173196210833, + "language_loss": 0.82465029, + "learning_rate": 0.0006594776861812608, + "loss": 0.83611095, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2156, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151315, + "balance_loss_mlp": 1.13991857, + "diversity_loss_mlp": 0.0, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06494614409867079, + "language_loss": 0.8654387, + "learning_rate": 0.0006591823848704776, + "loss": 0.87695187, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.11395264, + "routerloss_mlp": 0.0, + "step": 2157, + "time_per_iteration": 2.9039251804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134696, + "balance_loss_mlp": 1.12316287, + "diversity_loss_mlp": 0.0, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07584878913150254, + "language_loss": 0.81510401, + "learning_rate": 0.0006588870217596117, + "loss": 0.82645094, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2158, + "time_per_iteration": 2.7366249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121714, + "balance_loss_mlp": 1.11010289, + "diversity_loss_mlp": 0.0, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.0768974217493938, + "language_loss": 0.8567549, + "learning_rate": 0.0006585915969633334, + "loss": 0.86797202, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2159, + "time_per_iteration": 2.557969331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105923, + "balance_loss_mlp": 1.09437764, + "diversity_loss_mlp": 0.0, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06453825749462137, + "language_loss": 0.89545041, + "learning_rate": 0.0006582961105963366, + "loss": 0.90650964, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2160, + "time_per_iteration": 2.782766103744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089959, + "balance_loss_mlp": 1.07836008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.09389311079563152, + "language_loss": 0.77639234, + "learning_rate": 0.0006580005627733395, + "loss": 0.78729188, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2161, + "time_per_iteration": 2.7049734592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_mlp": 1.07492197, + "diversity_loss_mlp": 0.0, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.08236412019602501, + "language_loss": 0.81618345, + "learning_rate": 0.0006577049536090838, + "loss": 0.8270492, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2162, + "time_per_iteration": 2.723243236541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078674, + "balance_loss_mlp": 1.06676459, + "diversity_loss_mlp": 0.0, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.09869721655750711, + "language_loss": 0.85591501, + "learning_rate": 0.000657409283218335, + "loss": 0.86670172, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2163, + "time_per_iteration": 2.64973783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078005, + "balance_loss_mlp": 1.0662148, + "diversity_loss_mlp": 0.0, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.06806079796586995, + "language_loss": 0.81014043, + "learning_rate": 0.0006571135517158829, + "loss": 0.82092047, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2164, + "time_per_iteration": 2.6662614345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261192, + "balance_loss_mlp": 1.25542271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.0963910676883023, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.78025252, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2165, + "time_per_iteration": 4.733267068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07227921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.08489426271121504, + "language_loss": 0.83098751, + "learning_rate": 0.0006565219058351444, + "loss": 0.84183216, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 2166, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07506573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.0663020588108057, + "language_loss": 0.82663929, + "learning_rate": 0.0006562259916865553, + "loss": 0.83751583, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.12585449, + "routerloss_mlp": 0.0, + "step": 2167, + "time_per_iteration": 2.5647947788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085528, + "balance_loss_mlp": 1.07305884, + "diversity_loss_mlp": 0.0, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.11811458423881586, + "language_loss": 0.79392177, + "learning_rate": 0.0006559300168856573, + "loss": 0.80477709, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2168, + "time_per_iteration": 2.737071990966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090803, + "balance_loss_mlp": 1.07860184, + "diversity_loss_mlp": 0.0, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07183663020795078, + "language_loss": 0.86060214, + "learning_rate": 0.0006556339815473577, + "loss": 0.87151015, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2169, + "time_per_iteration": 2.6506707668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087149, + "balance_loss_mlp": 1.07504892, + "diversity_loss_mlp": 0.0, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.07609133400056706, + "language_loss": 0.86409211, + "learning_rate": 0.000655337885786588, + "loss": 0.87496364, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2170, + "time_per_iteration": 2.8835949897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.06654263, + "diversity_loss_mlp": 0.0, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08298304012821277, + "language_loss": 0.85129267, + "learning_rate": 0.0006550417297183025, + "loss": 0.86207461, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2171, + "time_per_iteration": 2.6195385456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087717, + "balance_loss_mlp": 1.07584357, + "diversity_loss_mlp": 0.0, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07223590906341684, + "language_loss": 0.81395489, + "learning_rate": 0.0006547455134574793, + "loss": 0.82483202, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2172, + "time_per_iteration": 2.688387155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091071, + "balance_loss_mlp": 1.07947183, + "diversity_loss_mlp": 0.0, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06986640066350178, + "language_loss": 0.84520721, + "learning_rate": 0.0006544492371191198, + "loss": 0.85611784, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2173, + "time_per_iteration": 3.1099753379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094341, + "balance_loss_mlp": 1.08226562, + "diversity_loss_mlp": 0.0, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.06657472623207703, + "language_loss": 0.8341983, + "learning_rate": 0.0006541529008182485, + "loss": 0.84514177, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2174, + "time_per_iteration": 3.203376054763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09567666, + "diversity_loss_mlp": 0.0, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.07167092475387357, + "language_loss": 0.87561977, + "learning_rate": 0.0006538565046699136, + "loss": 0.8866933, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2175, + "time_per_iteration": 2.6136248111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122872, + "balance_loss_mlp": 1.1111474, + "diversity_loss_mlp": 0.0, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.08073018870716439, + "language_loss": 0.81308544, + "learning_rate": 0.0006535600487891862, + "loss": 0.82431418, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2176, + "time_per_iteration": 2.8484995365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112142, + "balance_loss_mlp": 1.10968423, + "diversity_loss_mlp": 0.0, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.06933020813080157, + "language_loss": 0.89047962, + "learning_rate": 0.0006532635332911603, + "loss": 0.90169382, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2177, + "time_per_iteration": 2.6983814239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.12828767, + "diversity_loss_mlp": 0.0, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.07833316419755533, + "language_loss": 0.80340332, + "learning_rate": 0.0006529669582909541, + "loss": 0.81480134, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2178, + "time_per_iteration": 3.247034788131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130167, + "balance_loss_mlp": 1.11881781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08850961832331757, + "language_loss": 0.85867965, + "learning_rate": 0.0006526703239037077, + "loss": 0.86998129, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2179, + "time_per_iteration": 2.6653683185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933718, + "balance_loss_mlp": 1.62844765, + "diversity_loss_mlp": 0.20954823, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.029582524443817385, + "language_loss": 0.86593473, + "learning_rate": 0.0006523736302445851, + "loss": 0.87527192, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01471971, + "step": 2180, + "time_per_iteration": 2.857030153274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120353, + "balance_loss_mlp": 1.10893881, + "diversity_loss_mlp": 0.0, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.0687803817541909, + "language_loss": 0.77392578, + "learning_rate": 0.0006520768774287728, + "loss": 0.78512931, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2181, + "time_per_iteration": 5.625683307647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114289, + "balance_loss_mlp": 1.10282135, + "diversity_loss_mlp": 0.0, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06088029266780351, + "language_loss": 0.85493296, + "learning_rate": 0.0006517800655714806, + "loss": 0.86607587, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2182, + "time_per_iteration": 2.812955617904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105622, + "balance_loss_mlp": 1.09442866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07098705372074567, + "language_loss": 0.85399854, + "learning_rate": 0.0006514831947879407, + "loss": 0.86505473, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.11193848, + "routerloss_mlp": 0.0, + "step": 2183, + "time_per_iteration": 2.961418867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08642888, + "diversity_loss_mlp": 0.0, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.08450852264083888, + "language_loss": 0.78323019, + "learning_rate": 0.0006511862651934091, + "loss": 0.79420632, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2184, + "time_per_iteration": 3.076414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091172, + "balance_loss_mlp": 1.07956707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06921087236063693, + "language_loss": 0.82092035, + "learning_rate": 0.0006508892769031638, + "loss": 0.83183205, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2185, + "time_per_iteration": 2.638606309890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089875, + "balance_loss_mlp": 1.07868707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.07895440454445611, + "language_loss": 0.87322706, + "learning_rate": 0.000650592230032506, + "loss": 0.88412583, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2186, + "time_per_iteration": 2.702061176300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.0815382, + "diversity_loss_mlp": 0.0, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07748698496632533, + "language_loss": 0.85121393, + "learning_rate": 0.0006502951246967595, + "loss": 0.8621465, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2187, + "time_per_iteration": 2.871629476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087261, + "balance_loss_mlp": 1.07582331, + "diversity_loss_mlp": 0.0, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.06016607527200091, + "language_loss": 0.86913472, + "learning_rate": 0.0006499979610112706, + "loss": 0.88000733, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2188, + "time_per_iteration": 2.795278787612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107962, + "balance_loss_mlp": 1.06803894, + "diversity_loss_mlp": 0.0, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.0593739697007924, + "language_loss": 0.84024572, + "learning_rate": 0.000649700739091409, + "loss": 0.85104191, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2189, + "time_per_iteration": 2.822756290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123251, + "balance_loss_mlp": 1.11500144, + "diversity_loss_mlp": 0.0, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03860831682793276, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74959522, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 2190, + "time_per_iteration": 4.79919958114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082066, + "balance_loss_mlp": 1.07052088, + "diversity_loss_mlp": 0.0, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06761793691364075, + "language_loss": 0.85737348, + "learning_rate": 0.0006491061210101557, + "loss": 0.86819422, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2191, + "time_per_iteration": 2.661578416824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094285, + "balance_loss_mlp": 1.08270931, + "diversity_loss_mlp": 0.0, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0725556462678514, + "language_loss": 0.83956218, + "learning_rate": 0.0006488087250796157, + "loss": 0.85050505, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2192, + "time_per_iteration": 2.881225347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095445, + "balance_loss_mlp": 1.08376861, + "diversity_loss_mlp": 0.0, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.09298126342392905, + "language_loss": 0.81662476, + "learning_rate": 0.0006485112713764049, + "loss": 0.82757914, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2193, + "time_per_iteration": 2.8921914100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.08214593, + "diversity_loss_mlp": 0.0, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.058244545196029895, + "language_loss": 0.83715278, + "learning_rate": 0.0006482137600160051, + "loss": 0.84809017, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2194, + "time_per_iteration": 2.484341859817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094196, + "balance_loss_mlp": 1.08240056, + "diversity_loss_mlp": 0.0, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.08574033239321836, + "language_loss": 0.847399, + "learning_rate": 0.0006479161911139206, + "loss": 0.85834098, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2195, + "time_per_iteration": 2.5937106609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082105, + "balance_loss_mlp": 1.07043433, + "diversity_loss_mlp": 0.0, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08791937036502419, + "language_loss": 0.85522735, + "learning_rate": 0.0006476185647856778, + "loss": 0.86604846, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2196, + "time_per_iteration": 2.569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.06815672, + "diversity_loss_mlp": 0.0, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.07778870715402122, + "language_loss": 0.82192588, + "learning_rate": 0.0006473208811468255, + "loss": 0.83272707, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2197, + "time_per_iteration": 2.899557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.06046605, + "diversity_loss_mlp": 0.0, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.07330307904629892, + "language_loss": 0.84140831, + "learning_rate": 0.0006470231403129347, + "loss": 0.85212964, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.11663818, + "routerloss_mlp": 0.0, + "step": 2198, + "time_per_iteration": 2.602447509765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106933, + "balance_loss_mlp": 1.05760026, + "diversity_loss_mlp": 0.0, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06409293690085444, + "language_loss": 0.81590885, + "learning_rate": 0.0006467253423995988, + "loss": 0.82660222, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2199, + "time_per_iteration": 2.8557229042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_mlp": 1.06755078, + "diversity_loss_mlp": 0.0, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.07244216805562081, + "language_loss": 0.78831869, + "learning_rate": 0.000646427487522433, + "loss": 0.79911208, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2200, + "time_per_iteration": 2.65742826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_mlp": 1.07336855, + "diversity_loss_mlp": 0.0, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.07121994515744344, + "language_loss": 0.83032513, + "learning_rate": 0.0006461295757970749, + "loss": 0.84117424, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2201, + "time_per_iteration": 2.950655698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090426, + "balance_loss_mlp": 1.07880902, + "diversity_loss_mlp": 0.0, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.07713064950594434, + "language_loss": 0.81538546, + "learning_rate": 0.0006458316073391839, + "loss": 0.82628965, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2202, + "time_per_iteration": 2.8609914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089677, + "balance_loss_mlp": 1.07874584, + "diversity_loss_mlp": 0.0, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.07022827859020209, + "language_loss": 0.87709206, + "learning_rate": 0.0006455335822644422, + "loss": 0.88798881, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2203, + "time_per_iteration": 2.6978323459625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118526, + "balance_loss_mlp": 1.10743332, + "diversity_loss_mlp": 0.0, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.08724206882012846, + "language_loss": 0.78530163, + "learning_rate": 0.0006452355006885527, + "loss": 0.79648691, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2204, + "time_per_iteration": 2.686579704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00922718, + "balance_loss_mlp": 1.60671031, + "diversity_loss_mlp": 0.20807257, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.038668439213979985, + "language_loss": 0.8761735, + "learning_rate": 0.0006449373627272412, + "loss": 0.88540065, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532654, + "step": 2205, + "time_per_iteration": 2.7558722496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112883, + "balance_loss_mlp": 1.10164738, + "diversity_loss_mlp": 0.0, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08032286277613819, + "language_loss": 0.82142913, + "learning_rate": 0.0006446391684962553, + "loss": 0.83255792, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2206, + "time_per_iteration": 2.6579248905181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117757, + "balance_loss_mlp": 1.10650921, + "diversity_loss_mlp": 0.0, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.06707307211931093, + "language_loss": 0.82899106, + "learning_rate": 0.000644340918111364, + "loss": 0.8401686, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2207, + "time_per_iteration": 2.5347208976745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117145, + "balance_loss_mlp": 1.10573626, + "diversity_loss_mlp": 0.0, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.09153331321335235, + "language_loss": 0.84820396, + "learning_rate": 0.0006440426116883585, + "loss": 0.85937536, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2208, + "time_per_iteration": 2.5513036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.11258864, + "diversity_loss_mlp": 0.0, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.07442494649717855, + "language_loss": 0.86227304, + "learning_rate": 0.0006437442493430519, + "loss": 0.87351412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2209, + "time_per_iteration": 2.6560840606689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120019, + "balance_loss_mlp": 1.10829473, + "diversity_loss_mlp": 0.0, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.09545289030190586, + "language_loss": 0.86441422, + "learning_rate": 0.000643445831191278, + "loss": 0.8756144, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2210, + "time_per_iteration": 2.9028308391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.09162724, + "diversity_loss_mlp": 0.0, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.07646392549286844, + "language_loss": 0.81526744, + "learning_rate": 0.0006431473573488937, + "loss": 0.82629919, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2211, + "time_per_iteration": 2.7377443313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089807, + "balance_loss_mlp": 1.0782795, + "diversity_loss_mlp": 0.0, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.08107145257136338, + "language_loss": 0.85147351, + "learning_rate": 0.0006428488279317765, + "loss": 0.86237156, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2212, + "time_per_iteration": 2.6276626586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.08065951, + "diversity_loss_mlp": 0.0, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.09124161172132733, + "language_loss": 0.87490094, + "learning_rate": 0.0006425502430558259, + "loss": 0.88581866, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.11120605, + "routerloss_mlp": 0.0, + "step": 2213, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.08046961, + "diversity_loss_mlp": 0.0, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.06865062693642494, + "language_loss": 0.84588826, + "learning_rate": 0.0006422516028369628, + "loss": 0.85680431, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.11138916, + "routerloss_mlp": 0.0, + "step": 2214, + "time_per_iteration": 2.639619827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085781, + "balance_loss_mlp": 1.07456374, + "diversity_loss_mlp": 0.0, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.06481575152476399, + "language_loss": 0.83497036, + "learning_rate": 0.0006419529073911296, + "loss": 0.84582818, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2215, + "time_per_iteration": 2.8564555644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091551, + "balance_loss_mlp": 1.08075058, + "diversity_loss_mlp": 0.0, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.07537518077633425, + "language_loss": 0.85102242, + "learning_rate": 0.0006416541568342901, + "loss": 0.86193788, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2216, + "time_per_iteration": 2.8998327255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082292, + "balance_loss_mlp": 1.07092535, + "diversity_loss_mlp": 0.0, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.06331803259599181, + "language_loss": 0.84347832, + "learning_rate": 0.0006413553512824297, + "loss": 0.85430121, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2217, + "time_per_iteration": 2.754044532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_mlp": 1.07307625, + "diversity_loss_mlp": 0.0, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07616444203019798, + "language_loss": 0.84374213, + "learning_rate": 0.0006410564908515549, + "loss": 0.85458404, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2218, + "time_per_iteration": 2.724478006362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06966138, + "diversity_loss_mlp": 0.0, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.0731173396075932, + "language_loss": 0.85161233, + "learning_rate": 0.0006407575756576935, + "loss": 0.86242241, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2219, + "time_per_iteration": 2.754624128341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093699, + "balance_loss_mlp": 1.08191478, + "diversity_loss_mlp": 0.0, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.068521011535794, + "language_loss": 0.87612599, + "learning_rate": 0.0006404586058168951, + "loss": 0.88706297, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2220, + "time_per_iteration": 2.6972298622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100965, + "balance_loss_mlp": 1.08927631, + "diversity_loss_mlp": 0.0, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.1033551804820373, + "language_loss": 0.86327708, + "learning_rate": 0.0006401595814452296, + "loss": 0.87428677, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2221, + "time_per_iteration": 2.6071925163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.08816695, + "diversity_loss_mlp": 0.0, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07649462730323824, + "language_loss": 0.8070569, + "learning_rate": 0.000639860502658789, + "loss": 0.81805706, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2222, + "time_per_iteration": 2.6844141483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101843, + "balance_loss_mlp": 1.08965993, + "diversity_loss_mlp": 0.0, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0652732350229211, + "language_loss": 0.84929889, + "learning_rate": 0.0006395613695736853, + "loss": 0.86031729, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2223, + "time_per_iteration": 2.6799042224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091013, + "balance_loss_mlp": 1.07850194, + "diversity_loss_mlp": 0.0, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.10552751254703834, + "language_loss": 0.82026577, + "learning_rate": 0.0006392621823060529, + "loss": 0.83117592, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 2224, + "time_per_iteration": 2.722675323486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083211, + "balance_loss_mlp": 1.07109332, + "diversity_loss_mlp": 0.0, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.0790777786133485, + "language_loss": 0.8508532, + "learning_rate": 0.0006389629409720465, + "loss": 0.86168534, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2225, + "time_per_iteration": 2.6559393405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084084, + "balance_loss_mlp": 1.07179379, + "diversity_loss_mlp": 0.0, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0811747132385773, + "language_loss": 0.88654399, + "learning_rate": 0.0006386636456878417, + "loss": 0.89738482, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2226, + "time_per_iteration": 2.898261308670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_mlp": 1.07153535, + "diversity_loss_mlp": 0.0, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07696212536929578, + "language_loss": 0.92413348, + "learning_rate": 0.0006383642965696353, + "loss": 0.93497235, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2227, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932178, + "balance_loss_mlp": 1.62005818, + "diversity_loss_mlp": 0.21207821, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.033827312051000154, + "language_loss": 0.83018744, + "learning_rate": 0.000638064893733645, + "loss": 0.83950925, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01611001, + "step": 2228, + "time_per_iteration": 2.74554705619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00939878, + "balance_loss_mlp": 1.63503206, + "diversity_loss_mlp": 0.21170495, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.03357304306136308, + "language_loss": 0.90087909, + "learning_rate": 0.000637765437296109, + "loss": 0.91027784, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01650969, + "step": 2229, + "time_per_iteration": 2.6807308197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086799, + "balance_loss_mlp": 1.07446718, + "diversity_loss_mlp": 0.0, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.09425394332621637, + "language_loss": 0.85585725, + "learning_rate": 0.000637465927373287, + "loss": 0.86672527, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2230, + "time_per_iteration": 2.6279454231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088133, + "balance_loss_mlp": 1.0761342, + "diversity_loss_mlp": 0.0, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.13300209785278838, + "language_loss": 0.79446864, + "learning_rate": 0.000637166364081459, + "loss": 0.80534995, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.11993408, + "routerloss_mlp": 0.0, + "step": 2231, + "time_per_iteration": 2.7252066135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108179, + "balance_loss_mlp": 1.07001245, + "diversity_loss_mlp": 0.0, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.08046243261781533, + "language_loss": 0.84081841, + "learning_rate": 0.0006368667475369256, + "loss": 0.85163629, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2232, + "time_per_iteration": 2.756286382675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03840148, + "diversity_loss_mlp": 0.0, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.02809293853716727, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79574001, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.078125, + "routerloss_mlp": 0.0, + "step": 2233, + "time_per_iteration": 4.852276086807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_mlp": 1.02313304, + "diversity_loss_mlp": 0.0, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.02329901381823612, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79926044, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2234, + "time_per_iteration": 4.812516689300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107186, + "balance_loss_mlp": 1.09534228, + "diversity_loss_mlp": 0.0, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.06628794940731256, + "language_loss": 0.86166692, + "learning_rate": 0.0006359675795504112, + "loss": 0.87273884, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2235, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112399, + "balance_loss_mlp": 1.11230159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08124483128316094, + "language_loss": 0.74637383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75761378, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2236, + "time_per_iteration": 3.51676082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138804, + "balance_loss_mlp": 1.12733603, + "diversity_loss_mlp": 0.0, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.08045247853644188, + "language_loss": 0.85975677, + "learning_rate": 0.0006353678700956511, + "loss": 0.87114477, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2237, + "time_per_iteration": 2.5487072467803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137661, + "balance_loss_mlp": 1.12605572, + "diversity_loss_mlp": 0.0, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.08414636037035166, + "language_loss": 0.84184766, + "learning_rate": 0.0006350679364783569, + "loss": 0.85322422, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2238, + "time_per_iteration": 2.730128288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113973, + "balance_loss_mlp": 1.1279577, + "diversity_loss_mlp": 0.0, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.06707032645836293, + "language_loss": 0.85872072, + "learning_rate": 0.0006347679504230393, + "loss": 0.87011802, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2239, + "time_per_iteration": 2.640791893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136631, + "balance_loss_mlp": 1.12453079, + "diversity_loss_mlp": 0.0, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07174503893432663, + "language_loss": 0.7626543, + "learning_rate": 0.0006344679120461632, + "loss": 0.77402061, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2240, + "time_per_iteration": 3.3352768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128316, + "balance_loss_mlp": 1.11687779, + "diversity_loss_mlp": 0.0, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.08647233478950261, + "language_loss": 0.79984182, + "learning_rate": 0.0006341678214642134, + "loss": 0.81112498, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2241, + "time_per_iteration": 2.662132740020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114748, + "balance_loss_mlp": 1.10336995, + "diversity_loss_mlp": 0.0, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06482352137494116, + "language_loss": 0.82986903, + "learning_rate": 0.0006338676787936963, + "loss": 0.84101653, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2242, + "time_per_iteration": 3.064518451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123318, + "balance_loss_mlp": 1.11183178, + "diversity_loss_mlp": 0.0, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.07554467546841755, + "language_loss": 0.84015846, + "learning_rate": 0.0006335674841511367, + "loss": 0.85139167, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2243, + "time_per_iteration": 2.7494354248046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067953, + "balance_loss_mlp": 1.06189752, + "diversity_loss_mlp": 0.0, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.020266409588932003, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80249119, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2244, + "time_per_iteration": 5.019898414611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_mlp": 1.05208015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.017496917907237546, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78423691, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2245, + "time_per_iteration": 4.940483808517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111091, + "balance_loss_mlp": 1.09893775, + "diversity_loss_mlp": 0.0, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.07826437205196314, + "language_loss": 0.82487583, + "learning_rate": 0.0006326665895567652, + "loss": 0.83598673, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2246, + "time_per_iteration": 2.6287152767181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_mlp": 1.09895015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.09268036537549412, + "language_loss": 0.87613881, + "learning_rate": 0.0006323661881916976, + "loss": 0.88725001, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2247, + "time_per_iteration": 2.6966464519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_mlp": 1.08901072, + "diversity_loss_mlp": 0.0, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.07850654458656253, + "language_loss": 0.812437, + "learning_rate": 0.0006320657354375179, + "loss": 0.82344878, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2248, + "time_per_iteration": 3.0057384967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.08872366, + "diversity_loss_mlp": 0.0, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.07399569527983862, + "language_loss": 0.87203169, + "learning_rate": 0.0006317652314108726, + "loss": 0.88303995, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2249, + "time_per_iteration": 2.6106557846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083093, + "balance_loss_mlp": 1.07126176, + "diversity_loss_mlp": 0.0, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07131076511794647, + "language_loss": 0.91191232, + "learning_rate": 0.0006314646762284277, + "loss": 0.92274326, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2250, + "time_per_iteration": 2.601017951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_mlp": 1.02617049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.02997957544407836, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76458681, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2251, + "time_per_iteration": 4.872025966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_mlp": 1.07351613, + "diversity_loss_mlp": 0.0, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07162967916255573, + "language_loss": 0.77412337, + "learning_rate": 0.0006308634128629022, + "loss": 0.78497767, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2252, + "time_per_iteration": 2.858896255493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089815, + "balance_loss_mlp": 1.07750654, + "diversity_loss_mlp": 0.0, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0655401202696214, + "language_loss": 0.8742274, + "learning_rate": 0.0006305627049132531, + "loss": 0.88512552, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2253, + "time_per_iteration": 2.8089702129364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_mlp": 1.07309866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05577202062379855, + "language_loss": 0.85968709, + "learning_rate": 0.0006302619462746662, + "loss": 0.87054229, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2254, + "time_per_iteration": 3.117469072341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090126, + "balance_loss_mlp": 1.07842588, + "diversity_loss_mlp": 0.0, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.07095559842956704, + "language_loss": 0.90230805, + "learning_rate": 0.0006299611370639069, + "loss": 0.91320932, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2255, + "time_per_iteration": 2.723188638687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084284, + "balance_loss_mlp": 1.07239318, + "diversity_loss_mlp": 0.0, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07367301477096526, + "language_loss": 0.79524988, + "learning_rate": 0.0006296602773977593, + "loss": 0.80609274, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2256, + "time_per_iteration": 2.6743130683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099708, + "balance_loss_mlp": 1.08790588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06301035546935001, + "language_loss": 0.87406039, + "learning_rate": 0.0006293593673930277, + "loss": 0.88505745, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2257, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103769, + "balance_loss_mlp": 1.09211683, + "diversity_loss_mlp": 0.0, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07716264473653381, + "language_loss": 0.78774142, + "learning_rate": 0.0006290584071665358, + "loss": 0.79877913, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2258, + "time_per_iteration": 2.9148640632629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.07634544, + "diversity_loss_mlp": 0.0, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06859255861010008, + "language_loss": 0.82309216, + "learning_rate": 0.0006287573968351266, + "loss": 0.83397484, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2259, + "time_per_iteration": 2.582099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081836, + "balance_loss_mlp": 1.06989694, + "diversity_loss_mlp": 0.0, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.0728512329620832, + "language_loss": 0.8210361, + "learning_rate": 0.0006284563365156626, + "loss": 0.83185446, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2260, + "time_per_iteration": 2.802004814147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_mlp": 1.06343079, + "diversity_loss_mlp": 0.0, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.08318375282180102, + "language_loss": 0.87862843, + "learning_rate": 0.0006281552263250261, + "loss": 0.88938093, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2261, + "time_per_iteration": 2.5335495471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_mlp": 1.02721453, + "diversity_loss_mlp": 0.0, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02511862566194507, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81726044, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 2262, + "time_per_iteration": 4.858395338058472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05593562, + "diversity_loss_mlp": 0.0, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.07030760098393707, + "language_loss": 0.81181604, + "learning_rate": 0.0006275528567978593, + "loss": 0.82249182, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2263, + "time_per_iteration": 2.9562113285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.05570674, + "diversity_loss_mlp": 0.0, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.09515047383985015, + "language_loss": 0.82464182, + "learning_rate": 0.0006272515976951898, + "loss": 0.83531702, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2264, + "time_per_iteration": 3.0750486850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106778, + "balance_loss_mlp": 1.05625236, + "diversity_loss_mlp": 0.0, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06538835415995116, + "language_loss": 0.7903443, + "learning_rate": 0.0006269502891890687, + "loss": 0.80102211, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2265, + "time_per_iteration": 3.0723042488098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069278, + "balance_loss_mlp": 1.05721438, + "diversity_loss_mlp": 0.0, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.06791130510000161, + "language_loss": 0.88071477, + "learning_rate": 0.0006266489313964743, + "loss": 0.89140749, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2266, + "time_per_iteration": 2.7362618446350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00937641, + "balance_loss_mlp": 1.63294578, + "diversity_loss_mlp": 0.21328503, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.028233172977391998, + "language_loss": 0.85207379, + "learning_rate": 0.0006263475244344041, + "loss": 0.8614502, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01452552, + "step": 2267, + "time_per_iteration": 2.8842954635620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082065, + "balance_loss_mlp": 1.06979251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.07502115173737808, + "language_loss": 0.84271002, + "learning_rate": 0.0006260460684198746, + "loss": 0.8535307, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2268, + "time_per_iteration": 2.6355533599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.07749879, + "diversity_loss_mlp": 0.0, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07640014386484298, + "language_loss": 0.84040511, + "learning_rate": 0.0006257445634699213, + "loss": 0.85130346, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.12322998, + "routerloss_mlp": 0.0, + "step": 2269, + "time_per_iteration": 2.5279150009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089112, + "balance_loss_mlp": 1.07683921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.16142331523875347, + "language_loss": 0.83037758, + "learning_rate": 0.0006254430097015993, + "loss": 0.84126872, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2270, + "time_per_iteration": 2.660228729248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_mlp": 1.03087568, + "diversity_loss_mlp": 0.0, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.024589935077845904, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77516735, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2271, + "time_per_iteration": 4.794579744338989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070138, + "balance_loss_mlp": 1.05796623, + "diversity_loss_mlp": 0.0, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.057648382072647573, + "language_loss": 0.85053569, + "learning_rate": 0.0006248397561781609, + "loss": 0.86123705, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2272, + "time_per_iteration": 2.862569570541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067836, + "balance_loss_mlp": 1.05557537, + "diversity_loss_mlp": 0.0, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08840424380788836, + "language_loss": 0.86255217, + "learning_rate": 0.0006245380566572482, + "loss": 0.87323052, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2273, + "time_per_iteration": 2.7386484146118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.0566572, + "diversity_loss_mlp": 0.0, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07723857249852564, + "language_loss": 0.75794655, + "learning_rate": 0.0006242363087863744, + "loss": 0.76863599, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2274, + "time_per_iteration": 2.948030710220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.05560887, + "diversity_loss_mlp": 0.0, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06687985923679116, + "language_loss": 0.86043644, + "learning_rate": 0.0006239345126826878, + "loss": 0.87111151, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2275, + "time_per_iteration": 2.787750482559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071289, + "balance_loss_mlp": 1.05926108, + "diversity_loss_mlp": 0.0, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07503499995760528, + "language_loss": 0.83946115, + "learning_rate": 0.0006236326684633561, + "loss": 0.85017407, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2276, + "time_per_iteration": 2.8109841346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.05921769, + "diversity_loss_mlp": 0.0, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.08049471875944368, + "language_loss": 0.75253642, + "learning_rate": 0.0006233307762455658, + "loss": 0.76324785, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2277, + "time_per_iteration": 2.632291793823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_mlp": 1.06043518, + "diversity_loss_mlp": 0.0, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.0727539933311737, + "language_loss": 0.83312476, + "learning_rate": 0.0006230288361465216, + "loss": 0.8438465, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2278, + "time_per_iteration": 3.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106943, + "balance_loss_mlp": 1.05752659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.08745359184854619, + "language_loss": 0.84888816, + "learning_rate": 0.0006227268482834473, + "loss": 0.85958248, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2279, + "time_per_iteration": 2.9116861820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929134, + "balance_loss_mlp": 1.61467147, + "diversity_loss_mlp": 0.21327347, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.03053717197724305, + "language_loss": 0.8733198, + "learning_rate": 0.000622424812773585, + "loss": 0.88261116, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0151619, + "step": 2280, + "time_per_iteration": 2.83655047416687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087033, + "balance_loss_mlp": 1.07515955, + "diversity_loss_mlp": 0.0, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.09030781332224262, + "language_loss": 0.8003484, + "learning_rate": 0.000622122729734195, + "loss": 0.81121874, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2281, + "time_per_iteration": 2.598515033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088881, + "balance_loss_mlp": 1.07746708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.05965815533468205, + "language_loss": 0.87430406, + "learning_rate": 0.0006218205992825566, + "loss": 0.88519287, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2282, + "time_per_iteration": 2.6424663066864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_mlp": 1.07271123, + "diversity_loss_mlp": 0.0, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.06483845116972914, + "language_loss": 0.81733787, + "learning_rate": 0.0006215184215359671, + "loss": 0.8281818, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2283, + "time_per_iteration": 2.736311674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087917, + "balance_loss_mlp": 1.07662153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.0656289826640407, + "language_loss": 0.86697561, + "learning_rate": 0.0006212161966117425, + "loss": 0.8778547, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2284, + "time_per_iteration": 2.727402448654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091514, + "balance_loss_mlp": 1.07989156, + "diversity_loss_mlp": 0.0, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.07463232969806483, + "language_loss": 0.81628394, + "learning_rate": 0.0006209139246272164, + "loss": 0.8271991, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2285, + "time_per_iteration": 2.978759527206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.08205843, + "diversity_loss_mlp": 0.0, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.08236326374350296, + "language_loss": 0.81938732, + "learning_rate": 0.0006206116056997421, + "loss": 0.83032608, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2286, + "time_per_iteration": 2.6111207008361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085811, + "balance_loss_mlp": 1.07444477, + "diversity_loss_mlp": 0.0, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.06662472973472185, + "language_loss": 0.82727671, + "learning_rate": 0.0006203092399466892, + "loss": 0.83813483, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2287, + "time_per_iteration": 2.6246864795684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109279, + "balance_loss_mlp": 1.08137023, + "diversity_loss_mlp": 0.0, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.06470350083987941, + "language_loss": 0.85380936, + "learning_rate": 0.0006200068274854473, + "loss": 0.86473733, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.11419678, + "routerloss_mlp": 0.0, + "step": 2288, + "time_per_iteration": 2.675197124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091416, + "balance_loss_mlp": 1.07988858, + "diversity_loss_mlp": 0.0, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0650031810595099, + "language_loss": 0.8588661, + "learning_rate": 0.0006197043684334229, + "loss": 0.86978024, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2289, + "time_per_iteration": 2.787095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_mlp": 1.08063841, + "diversity_loss_mlp": 0.0, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.0715970788084748, + "language_loss": 0.79333103, + "learning_rate": 0.0006194018629080411, + "loss": 0.80425215, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2290, + "time_per_iteration": 2.817836284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.09150028, + "diversity_loss_mlp": 0.0, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.07061114258803743, + "language_loss": 0.81714827, + "learning_rate": 0.0006190993110267451, + "loss": 0.82817852, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2291, + "time_per_iteration": 2.741288900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108227, + "balance_loss_mlp": 1.09614503, + "diversity_loss_mlp": 0.0, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.07455801894128893, + "language_loss": 0.84193838, + "learning_rate": 0.0006187967129069958, + "loss": 0.85302061, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2292, + "time_per_iteration": 2.5778286457061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106682, + "balance_loss_mlp": 1.09472573, + "diversity_loss_mlp": 0.0, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.06400814904414545, + "language_loss": 0.8690064, + "learning_rate": 0.0006184940686662722, + "loss": 0.88007319, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2293, + "time_per_iteration": 2.7292487621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111022, + "balance_loss_mlp": 1.09812045, + "diversity_loss_mlp": 0.0, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06813451942076464, + "language_loss": 0.90379488, + "learning_rate": 0.0006181913784220714, + "loss": 0.91489702, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2294, + "time_per_iteration": 2.6506428718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081962, + "balance_loss_mlp": 1.0750953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.029819366941177792, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81635749, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2295, + "time_per_iteration": 4.882002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110182, + "balance_loss_mlp": 1.09772444, + "diversity_loss_mlp": 0.0, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07012194180041048, + "language_loss": 0.7971437, + "learning_rate": 0.0006175858603933146, + "loss": 0.80824548, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.12469482, + "routerloss_mlp": 0.0, + "step": 2296, + "time_per_iteration": 2.8836371898651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908854, + "balance_loss_mlp": 1.58032632, + "diversity_loss_mlp": 0.2095283, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.03267646081870075, + "language_loss": 0.80986243, + "learning_rate": 0.0006172830328438416, + "loss": 0.81895095, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01392685, + "step": 2297, + "time_per_iteration": 2.9758472442626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093338, + "balance_loss_mlp": 1.0806725, + "diversity_loss_mlp": 0.0, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.0684627092891604, + "language_loss": 0.86739677, + "learning_rate": 0.0006169801597610572, + "loss": 0.87833017, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2298, + "time_per_iteration": 2.796999454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080861, + "balance_loss_mlp": 1.06855834, + "diversity_loss_mlp": 0.0, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09148837874044675, + "language_loss": 0.89672303, + "learning_rate": 0.0006166772412625469, + "loss": 0.90753162, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2299, + "time_per_iteration": 2.719217300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079493, + "balance_loss_mlp": 1.06674969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.0806717243265584, + "language_loss": 0.81995088, + "learning_rate": 0.0006163742774659141, + "loss": 0.83074582, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2300, + "time_per_iteration": 2.857851266860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082582, + "balance_loss_mlp": 1.07051837, + "diversity_loss_mlp": 0.0, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07368324051857801, + "language_loss": 0.85920924, + "learning_rate": 0.0006160712684887801, + "loss": 0.87003505, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2301, + "time_per_iteration": 2.7615816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076648, + "balance_loss_mlp": 1.06491232, + "diversity_loss_mlp": 0.0, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.07775198871362894, + "language_loss": 0.81987381, + "learning_rate": 0.0006157682144487832, + "loss": 0.83064032, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2302, + "time_per_iteration": 2.759446620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071769, + "balance_loss_mlp": 1.05998516, + "diversity_loss_mlp": 0.0, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.07391427816126875, + "language_loss": 0.82887244, + "learning_rate": 0.0006154651154635793, + "loss": 0.83959019, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2303, + "time_per_iteration": 2.8566582202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074582, + "balance_loss_mlp": 1.0627867, + "diversity_loss_mlp": 0.0, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07276664214775759, + "language_loss": 0.84800553, + "learning_rate": 0.0006151619716508421, + "loss": 0.85875136, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2304, + "time_per_iteration": 2.678624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070842, + "balance_loss_mlp": 1.05890322, + "diversity_loss_mlp": 0.0, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0708190445963316, + "language_loss": 0.87117589, + "learning_rate": 0.0006148587831282625, + "loss": 0.88188434, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2305, + "time_per_iteration": 2.6833643913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05813479, + "diversity_loss_mlp": 0.0, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03167846404368131, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80241072, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2306, + "time_per_iteration": 4.908214092254639 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.06202734, + "diversity_loss_mlp": 0.0, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.10781991147306623, + "language_loss": 0.87386847, + "learning_rate": 0.0006142522724244255, + "loss": 0.8846153, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.12664795, + "routerloss_mlp": 0.0, + "step": 2307, + "time_per_iteration": 2.559011459350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_mlp": 1.03301477, + "diversity_loss_mlp": 0.0, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.019467834986953515, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77524698, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.06982422, + "routerloss_mlp": 0.0, + "step": 2308, + "time_per_iteration": 4.990226984024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010956, + "balance_loss_mlp": 1.08379281, + "diversity_loss_mlp": 0.0, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.134173965781989, + "language_loss": 0.77330542, + "learning_rate": 0.000613645584293942, + "loss": 0.78426147, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2309, + "time_per_iteration": 2.925625801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096392, + "balance_loss_mlp": 1.08444726, + "diversity_loss_mlp": 0.0, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.07260585347328512, + "language_loss": 0.83497787, + "learning_rate": 0.0006133421739881185, + "loss": 0.84594172, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2310, + "time_per_iteration": 2.6521387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105727, + "balance_loss_mlp": 1.09360933, + "diversity_loss_mlp": 0.0, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08716252058009813, + "language_loss": 0.82747865, + "learning_rate": 0.0006130387196789605, + "loss": 0.8385359, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2311, + "time_per_iteration": 2.7266759872436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100575, + "balance_loss_mlp": 1.08809423, + "diversity_loss_mlp": 0.0, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.057672451626414926, + "language_loss": 0.84308195, + "learning_rate": 0.0006127352214842795, + "loss": 0.85408771, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2312, + "time_per_iteration": 2.9728119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_mlp": 1.09263897, + "diversity_loss_mlp": 0.0, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.09124128780751645, + "language_loss": 0.85551131, + "learning_rate": 0.0006124316795219041, + "loss": 0.86655927, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2313, + "time_per_iteration": 2.793999671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098642, + "balance_loss_mlp": 1.08649504, + "diversity_loss_mlp": 0.0, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.07392199689713573, + "language_loss": 0.82170153, + "learning_rate": 0.0006121280939096794, + "loss": 0.83268797, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 2314, + "time_per_iteration": 2.7882213592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087686, + "balance_loss_mlp": 1.07496047, + "diversity_loss_mlp": 0.0, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.07188819518398708, + "language_loss": 0.87831259, + "learning_rate": 0.000611824464765468, + "loss": 0.88918942, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.12738037, + "routerloss_mlp": 0.0, + "step": 2315, + "time_per_iteration": 2.570239305496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_mlp": 1.03435254, + "diversity_loss_mlp": 0.0, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.031544046963938845, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79636735, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 2316, + "time_per_iteration": 4.63933539390564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107211, + "balance_loss_mlp": 1.05995071, + "diversity_loss_mlp": 0.0, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.10006595419905694, + "language_loss": 0.85561663, + "learning_rate": 0.000611217076352619, + "loss": 0.86633772, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2317, + "time_per_iteration": 2.763282299041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068374, + "balance_loss_mlp": 1.05613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.07080250397958886, + "language_loss": 0.8323034, + "learning_rate": 0.0006109133173197905, + "loss": 0.84298718, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2318, + "time_per_iteration": 2.7228074073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.0546751, + "diversity_loss_mlp": 0.0, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07919775459104113, + "language_loss": 0.85392821, + "learning_rate": 0.0006106095152265935, + "loss": 0.86459887, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.12390137, + "routerloss_mlp": 0.0, + "step": 2319, + "time_per_iteration": 2.950333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.05547166, + "diversity_loss_mlp": 0.0, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.061336847968553085, + "language_loss": 0.84789562, + "learning_rate": 0.0006103056701909739, + "loss": 0.85857224, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2320, + "time_per_iteration": 2.9283788204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076472, + "balance_loss_mlp": 1.06437278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.06696737396207848, + "language_loss": 0.83276129, + "learning_rate": 0.0006100017823308956, + "loss": 0.84352595, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2321, + "time_per_iteration": 3.159337282180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072799, + "balance_loss_mlp": 1.06091988, + "diversity_loss_mlp": 0.0, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.07676377008356373, + "language_loss": 0.79803503, + "learning_rate": 0.0006096978517643377, + "loss": 0.80876303, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2322, + "time_per_iteration": 2.8253674507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00921995, + "balance_loss_mlp": 1.60181236, + "diversity_loss_mlp": 0.21422489, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.03237790796068106, + "language_loss": 0.83347481, + "learning_rate": 0.0006093938786092968, + "loss": 0.84269476, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01397606, + "step": 2323, + "time_per_iteration": 2.648444890975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110106, + "balance_loss_mlp": 1.09840608, + "diversity_loss_mlp": 0.0, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.07300553293113453, + "language_loss": 0.90023661, + "learning_rate": 0.0006090898629837857, + "loss": 0.91133773, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2324, + "time_per_iteration": 2.852698564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126468, + "balance_loss_mlp": 1.11461282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.06000654076761871, + "language_loss": 0.87143672, + "learning_rate": 0.0006087858050058337, + "loss": 0.8827014, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2325, + "time_per_iteration": 2.7674834728240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138358, + "balance_loss_mlp": 1.12663388, + "diversity_loss_mlp": 0.0, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.0853990663964482, + "language_loss": 0.82412744, + "learning_rate": 0.0006084817047934866, + "loss": 0.83551097, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2326, + "time_per_iteration": 2.6421871185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121575, + "balance_loss_mlp": 1.10977352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08985792381424736, + "language_loss": 0.89330196, + "learning_rate": 0.0006081775624648066, + "loss": 0.90451771, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2327, + "time_per_iteration": 2.578197956085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131674, + "balance_loss_mlp": 1.12057006, + "diversity_loss_mlp": 0.0, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.0872530433154025, + "language_loss": 0.83162999, + "learning_rate": 0.0006078733781378721, + "loss": 0.84294665, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2328, + "time_per_iteration": 2.6186208724975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099348, + "balance_loss_mlp": 1.08810675, + "diversity_loss_mlp": 0.0, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07633837573658239, + "language_loss": 0.82202363, + "learning_rate": 0.0006075691519307781, + "loss": 0.83301711, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2329, + "time_per_iteration": 2.9000244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094166, + "balance_loss_mlp": 1.08247721, + "diversity_loss_mlp": 0.0, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0736281868256213, + "language_loss": 0.81618124, + "learning_rate": 0.0006072648839616356, + "loss": 0.82712287, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2330, + "time_per_iteration": 2.6364829540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083826, + "balance_loss_mlp": 1.07230425, + "diversity_loss_mlp": 0.0, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.0657010816534965, + "language_loss": 0.82723016, + "learning_rate": 0.0006069605743485718, + "loss": 0.83806837, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2331, + "time_per_iteration": 3.3334474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086805, + "balance_loss_mlp": 1.07531917, + "diversity_loss_mlp": 0.0, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07225675858451452, + "language_loss": 0.83265316, + "learning_rate": 0.0006066562232097303, + "loss": 0.84352124, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2332, + "time_per_iteration": 2.705143690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.07051468, + "diversity_loss_mlp": 0.0, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.06521315479324259, + "language_loss": 0.8614397, + "learning_rate": 0.0006063518306632708, + "loss": 0.87226027, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2333, + "time_per_iteration": 2.9501705169677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085822, + "balance_loss_mlp": 1.07427073, + "diversity_loss_mlp": 0.0, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.07251688845149425, + "language_loss": 0.82197714, + "learning_rate": 0.0006060473968273688, + "loss": 0.83283544, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2334, + "time_per_iteration": 2.708394765853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_mlp": 1.032179, + "diversity_loss_mlp": 0.0, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.02865006957504222, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78918916, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.07177734, + "routerloss_mlp": 0.0, + "step": 2335, + "time_per_iteration": 4.866912841796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_mlp": 1.01901519, + "diversity_loss_mlp": 0.0, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.021847156852776353, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82031286, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 2336, + "time_per_iteration": 4.834076642990112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108818, + "balance_loss_mlp": 1.07613969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.09890748330953583, + "language_loss": 0.88285863, + "learning_rate": 0.0006051338487650047, + "loss": 0.89374042, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2337, + "time_per_iteration": 2.4428114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00930205, + "balance_loss_mlp": 1.62015963, + "diversity_loss_mlp": 0.20974493, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.03186253719782368, + "language_loss": 0.82399797, + "learning_rate": 0.0006048292509534095, + "loss": 0.83329999, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01525321, + "step": 2338, + "time_per_iteration": 2.6332457065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.06772542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.08456945041025239, + "language_loss": 0.77873439, + "learning_rate": 0.0006045246124434895, + "loss": 0.7895329, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2339, + "time_per_iteration": 2.7590980529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073061, + "balance_loss_mlp": 1.06156278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06841757056071682, + "language_loss": 0.86623305, + "learning_rate": 0.0006042199333535162, + "loss": 0.87696362, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2340, + "time_per_iteration": 3.293574333190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079106, + "balance_loss_mlp": 1.06769133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06101547553515947, + "language_loss": 0.84343052, + "learning_rate": 0.0006039152138017763, + "loss": 0.85422158, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2341, + "time_per_iteration": 3.0700981616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.07579744, + "diversity_loss_mlp": 0.0, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.09071323966594208, + "language_loss": 0.83541143, + "learning_rate": 0.0006036104539065726, + "loss": 0.84628195, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.11260986, + "routerloss_mlp": 0.0, + "step": 2342, + "time_per_iteration": 2.6694719791412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089407, + "balance_loss_mlp": 1.07793319, + "diversity_loss_mlp": 0.0, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.08270437502254605, + "language_loss": 0.84371507, + "learning_rate": 0.000603305653786223, + "loss": 0.85460913, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2343, + "time_per_iteration": 3.16105318069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_mlp": 1.07187295, + "diversity_loss_mlp": 0.0, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.07028076371432387, + "language_loss": 0.84103405, + "learning_rate": 0.0006030008135590622, + "loss": 0.85186827, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2344, + "time_per_iteration": 2.7197835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082164, + "balance_loss_mlp": 1.07096398, + "diversity_loss_mlp": 0.0, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.05864949769745669, + "language_loss": 0.7999413, + "learning_rate": 0.0006026959333434387, + "loss": 0.81076288, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2345, + "time_per_iteration": 2.777010202407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919083, + "balance_loss_mlp": 1.6008426, + "diversity_loss_mlp": 0.20793086, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.028469676504860836, + "language_loss": 0.77684712, + "learning_rate": 0.0006023910132577181, + "loss": 0.78603798, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01469593, + "step": 2346, + "time_per_iteration": 2.689173936843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.08186746, + "diversity_loss_mlp": 0.0, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.07173117007756048, + "language_loss": 0.84956741, + "learning_rate": 0.0006020860534202806, + "loss": 0.86050057, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2347, + "time_per_iteration": 2.499941110610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099048, + "balance_loss_mlp": 1.08747303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06525031943024168, + "language_loss": 0.81076705, + "learning_rate": 0.0006017810539495224, + "loss": 0.82175756, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2348, + "time_per_iteration": 2.9487318992614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094541, + "balance_loss_mlp": 1.08284068, + "diversity_loss_mlp": 0.0, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07881291561071736, + "language_loss": 0.82607108, + "learning_rate": 0.0006014760149638547, + "loss": 0.83701646, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.11700439, + "routerloss_mlp": 0.0, + "step": 2349, + "time_per_iteration": 2.7228691577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.0852139, + "diversity_loss_mlp": 0.0, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.08019466042103662, + "language_loss": 0.88398969, + "learning_rate": 0.000601170936581704, + "loss": 0.8949548, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2350, + "time_per_iteration": 2.521714687347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07951522, + "diversity_loss_mlp": 0.0, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.08533615412567333, + "language_loss": 0.84897137, + "learning_rate": 0.0006008658189215121, + "loss": 0.85987866, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2351, + "time_per_iteration": 2.6506216526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07545722, + "diversity_loss_mlp": 0.0, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.09237808795246917, + "language_loss": 0.80232167, + "learning_rate": 0.0006005606621017366, + "loss": 0.81319243, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2352, + "time_per_iteration": 2.5878968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010807, + "balance_loss_mlp": 1.06907678, + "diversity_loss_mlp": 0.0, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.07057821380790058, + "language_loss": 0.80339801, + "learning_rate": 0.0006002554662408496, + "loss": 0.81420493, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2353, + "time_per_iteration": 2.883782386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.0691061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.0736680584084088, + "language_loss": 0.9135446, + "learning_rate": 0.0005999502314573388, + "loss": 0.9243511, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2354, + "time_per_iteration": 2.645484685897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_mlp": 1.09201527, + "diversity_loss_mlp": 0.0, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07036557956994945, + "language_loss": 0.86196381, + "learning_rate": 0.0005996449578697066, + "loss": 0.87299991, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2355, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906536, + "balance_loss_mlp": 1.57839537, + "diversity_loss_mlp": 0.20635399, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.031145483684461562, + "language_loss": 0.81619978, + "learning_rate": 0.0005993396455964709, + "loss": 0.82526517, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01416124, + "step": 2356, + "time_per_iteration": 2.7277767658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.14805746, + "diversity_loss_mlp": 0.0, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07904312092760724, + "language_loss": 0.81657517, + "learning_rate": 0.0005990342947561647, + "loss": 0.82816887, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2357, + "time_per_iteration": 2.696223258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167894, + "balance_loss_mlp": 1.15651524, + "diversity_loss_mlp": 0.0, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.07381995676601517, + "language_loss": 0.78198934, + "learning_rate": 0.0005987289054673351, + "loss": 0.79366827, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2358, + "time_per_iteration": 2.602642059326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360078, + "balance_loss_mlp": 1.35392714, + "diversity_loss_mlp": 0.0, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.12195170998658643, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77935815, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2359, + "time_per_iteration": 4.880090713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146892, + "balance_loss_mlp": 1.13553107, + "diversity_loss_mlp": 0.0, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07250720881476776, + "language_loss": 0.91548061, + "learning_rate": 0.0005981180120183722, + "loss": 0.9269495, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2360, + "time_per_iteration": 2.680730104446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133243, + "balance_loss_mlp": 1.121382, + "diversity_loss_mlp": 0.0, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.055968167495159496, + "language_loss": 0.85338825, + "learning_rate": 0.0005978125080954089, + "loss": 0.8647207, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2361, + "time_per_iteration": 2.791376829147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124184, + "balance_loss_mlp": 1.11265099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.08653591933533131, + "language_loss": 0.77322888, + "learning_rate": 0.000597506966198262, + "loss": 0.7844708, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2362, + "time_per_iteration": 2.97446870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119088, + "balance_loss_mlp": 1.10733426, + "diversity_loss_mlp": 0.0, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.09240364374598002, + "language_loss": 0.84247041, + "learning_rate": 0.0005972013864455536, + "loss": 0.85366124, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2363, + "time_per_iteration": 2.577167510986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108786, + "balance_loss_mlp": 1.09771168, + "diversity_loss_mlp": 0.0, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.0787330127694287, + "language_loss": 0.8535012, + "learning_rate": 0.0005968957689559203, + "loss": 0.8645891, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2364, + "time_per_iteration": 2.7120981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105615, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07389843074969835, + "language_loss": 0.88484383, + "learning_rate": 0.0005965901138480131, + "loss": 0.89590001, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2365, + "time_per_iteration": 2.578874349594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110202, + "balance_loss_mlp": 1.09081471, + "diversity_loss_mlp": 0.0, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.06426783448513047, + "language_loss": 0.87068385, + "learning_rate": 0.0005962844212404982, + "loss": 0.88170409, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.11206055, + "routerloss_mlp": 0.0, + "step": 2366, + "time_per_iteration": 2.6638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096105, + "balance_loss_mlp": 1.08472049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05830156527831164, + "language_loss": 0.87147355, + "learning_rate": 0.0005959786912520558, + "loss": 0.88243461, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2367, + "time_per_iteration": 2.6142454147338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088371, + "balance_loss_mlp": 1.07726681, + "diversity_loss_mlp": 0.0, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.06261196085687584, + "language_loss": 0.83712542, + "learning_rate": 0.0005956729240013806, + "loss": 0.84800917, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2368, + "time_per_iteration": 2.786256790161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095858, + "balance_loss_mlp": 1.08447385, + "diversity_loss_mlp": 0.0, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06874460659515655, + "language_loss": 0.91648531, + "learning_rate": 0.0005953671196071824, + "loss": 0.92744386, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2369, + "time_per_iteration": 2.756943941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093695, + "balance_loss_mlp": 1.08220375, + "diversity_loss_mlp": 0.0, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.07258619671695062, + "language_loss": 0.80044961, + "learning_rate": 0.0005950612781881846, + "loss": 0.81138659, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2370, + "time_per_iteration": 2.6791019439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906758, + "balance_loss_mlp": 1.57760763, + "diversity_loss_mlp": 0.20680004, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.03266097765038979, + "language_loss": 0.76005763, + "learning_rate": 0.0005947553998631259, + "loss": 0.76912522, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01455403, + "step": 2371, + "time_per_iteration": 2.908493995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010769, + "balance_loss_mlp": 1.06543183, + "diversity_loss_mlp": 0.0, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.05564189265933484, + "language_loss": 0.79205543, + "learning_rate": 0.000594449484750758, + "loss": 0.80282438, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2372, + "time_per_iteration": 3.18151593208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072251, + "balance_loss_mlp": 1.06046152, + "diversity_loss_mlp": 0.0, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07444834598910231, + "language_loss": 0.83208215, + "learning_rate": 0.0005941435329698484, + "loss": 0.84280467, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2373, + "time_per_iteration": 2.6709630489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.05895281, + "diversity_loss_mlp": 0.0, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.06837725942446468, + "language_loss": 0.83204812, + "learning_rate": 0.0005938375446391778, + "loss": 0.84275293, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2374, + "time_per_iteration": 2.6943106651306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.06261396, + "diversity_loss_mlp": 0.0, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.0748623734907781, + "language_loss": 0.8912878, + "learning_rate": 0.0005935315198775415, + "loss": 0.90203297, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2375, + "time_per_iteration": 2.6303911209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.05491209, + "diversity_loss_mlp": 0.0, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06590971106227904, + "language_loss": 0.87093645, + "learning_rate": 0.0005932254588037486, + "loss": 0.88160467, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2376, + "time_per_iteration": 2.5003554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106434, + "balance_loss_mlp": 1.0520016, + "diversity_loss_mlp": 0.0, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07188519107297629, + "language_loss": 0.86239958, + "learning_rate": 0.000592919361536623, + "loss": 0.87304294, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2377, + "time_per_iteration": 2.6426758766174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106134, + "balance_loss_mlp": 1.04946113, + "diversity_loss_mlp": 0.0, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.06083573176815847, + "language_loss": 0.88679874, + "learning_rate": 0.0005926132281950017, + "loss": 0.89741206, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2378, + "time_per_iteration": 2.7510690689086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065869, + "balance_loss_mlp": 1.05310154, + "diversity_loss_mlp": 0.0, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07940360452878177, + "language_loss": 0.85365742, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431611, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.12774658, + "routerloss_mlp": 0.0, + "step": 2379, + "time_per_iteration": 2.7969985008239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066405, + "balance_loss_mlp": 1.05444837, + "diversity_loss_mlp": 0.0, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.06398281947580985, + "language_loss": 0.86384034, + "learning_rate": 0.0005920008537636931, + "loss": 0.87450439, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2380, + "time_per_iteration": 2.90964412689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.05391335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.05698304417859526, + "language_loss": 0.86739266, + "learning_rate": 0.0005916946129117504, + "loss": 0.87805718, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.12548828, + "routerloss_mlp": 0.0, + "step": 2381, + "time_per_iteration": 2.9013612270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.06223381, + "diversity_loss_mlp": 0.0, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07634094682432664, + "language_loss": 0.80304879, + "learning_rate": 0.0005913883364608017, + "loss": 0.81379426, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2382, + "time_per_iteration": 3.086503505706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_mlp": 1.07212973, + "diversity_loss_mlp": 0.0, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.06243795661807547, + "language_loss": 0.8841778, + "learning_rate": 0.0005910820245297542, + "loss": 0.89501894, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 2383, + "time_per_iteration": 2.8612842559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090258, + "balance_loss_mlp": 1.07756186, + "diversity_loss_mlp": 0.0, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.08243832238560393, + "language_loss": 0.80972016, + "learning_rate": 0.000590775677237529, + "loss": 0.82062268, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 2384, + "time_per_iteration": 2.731405735015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094631, + "balance_loss_mlp": 1.08257282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.07578687885193977, + "language_loss": 0.80532229, + "learning_rate": 0.0005904692947030601, + "loss": 0.81626856, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.1204834, + "routerloss_mlp": 0.0, + "step": 2385, + "time_per_iteration": 2.6176209449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106556, + "balance_loss_mlp": 1.09437895, + "diversity_loss_mlp": 0.0, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.08078833732724985, + "language_loss": 0.8953619, + "learning_rate": 0.0005901628770452963, + "loss": 0.90642744, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2386, + "time_per_iteration": 2.5513737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.10345697, + "diversity_loss_mlp": 0.0, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.09403156888929357, + "language_loss": 0.87502134, + "learning_rate": 0.000589856424383199, + "loss": 0.88617843, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2387, + "time_per_iteration": 2.599862813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111298, + "balance_loss_mlp": 1.10114813, + "diversity_loss_mlp": 0.0, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.08117329221401763, + "language_loss": 0.8309918, + "learning_rate": 0.000589549936835744, + "loss": 0.8421216, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2388, + "time_per_iteration": 2.914754867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101869, + "balance_loss_mlp": 1.0899775, + "diversity_loss_mlp": 0.0, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06559429512714879, + "language_loss": 0.79056096, + "learning_rate": 0.0005892434145219202, + "loss": 0.80157959, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2389, + "time_per_iteration": 2.6295268535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898813, + "balance_loss_mlp": 1.5620172, + "diversity_loss_mlp": 0.2081904, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.0365067866217014, + "language_loss": 0.82780147, + "learning_rate": 0.0005889368575607303, + "loss": 0.83678961, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01370906, + "step": 2390, + "time_per_iteration": 2.8635401725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089349, + "balance_loss_mlp": 1.07753515, + "diversity_loss_mlp": 0.0, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.056196182118315396, + "language_loss": 0.78421402, + "learning_rate": 0.00058863026607119, + "loss": 0.79510748, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2391, + "time_per_iteration": 3.0734708309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.08715332, + "diversity_loss_mlp": 0.0, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07079174515079527, + "language_loss": 0.795928, + "learning_rate": 0.0005883236401723287, + "loss": 0.80691886, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2392, + "time_per_iteration": 3.1697676181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.08348131, + "diversity_loss_mlp": 0.0, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08882239564338372, + "language_loss": 0.84418833, + "learning_rate": 0.0005880169799831893, + "loss": 0.85514069, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2393, + "time_per_iteration": 2.668509006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_mlp": 1.08327174, + "diversity_loss_mlp": 0.0, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.06874062850812142, + "language_loss": 0.81593782, + "learning_rate": 0.0005877102856228278, + "loss": 0.82688844, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2394, + "time_per_iteration": 2.862039566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099301, + "balance_loss_mlp": 1.08791018, + "diversity_loss_mlp": 0.0, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07005170830273995, + "language_loss": 0.84822053, + "learning_rate": 0.0005874035572103133, + "loss": 0.85921353, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2395, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092906, + "balance_loss_mlp": 1.08152771, + "diversity_loss_mlp": 0.0, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09691208121118819, + "language_loss": 0.82382149, + "learning_rate": 0.0005870967948647288, + "loss": 0.83475053, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2396, + "time_per_iteration": 2.8379006385803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259876, + "balance_loss_mlp": 1.25238955, + "diversity_loss_mlp": 0.0, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.08205623370138872, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75568175, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2397, + "time_per_iteration": 5.0380027294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912357, + "balance_loss_mlp": 1.5885272, + "diversity_loss_mlp": 0.20776251, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.030510515868204604, + "language_loss": 0.86040902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86953259, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0142122, + "step": 2398, + "time_per_iteration": 2.9795196056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099565, + "balance_loss_mlp": 1.08854449, + "diversity_loss_mlp": 0.0, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.07495608045078013, + "language_loss": 0.75224954, + "learning_rate": 0.0005861763054205754, + "loss": 0.76324517, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2399, + "time_per_iteration": 2.7307660579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908198, + "balance_loss_mlp": 1.58042729, + "diversity_loss_mlp": 0.20863593, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.03052990379504839, + "language_loss": 0.8056978, + "learning_rate": 0.0005858694085337976, + "loss": 0.81477976, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01366598, + "step": 2400, + "time_per_iteration": 2.8421711921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115275, + "balance_loss_mlp": 1.10424817, + "diversity_loss_mlp": 0.0, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08470381171074581, + "language_loss": 0.8355788, + "learning_rate": 0.0005855624783095589, + "loss": 0.84673154, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2401, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10386109, + "diversity_loss_mlp": 0.0, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.07139821582333657, + "language_loss": 0.85265267, + "learning_rate": 0.00058525551486702, + "loss": 0.86379993, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2402, + "time_per_iteration": 2.5159239768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119193, + "balance_loss_mlp": 1.10795164, + "diversity_loss_mlp": 0.0, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.08747389081307531, + "language_loss": 0.80850065, + "learning_rate": 0.0005849485183253548, + "loss": 0.81969261, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2403, + "time_per_iteration": 2.643031358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110943, + "balance_loss_mlp": 1.09971905, + "diversity_loss_mlp": 0.0, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06974006499463392, + "language_loss": 0.8764264, + "learning_rate": 0.0005846414888037501, + "loss": 0.88753581, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2404, + "time_per_iteration": 2.4847412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_mlp": 1.07962489, + "diversity_loss_mlp": 0.0, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.07303422211334305, + "language_loss": 0.82384312, + "learning_rate": 0.0005843344264214049, + "loss": 0.83475375, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.11444092, + "routerloss_mlp": 0.0, + "step": 2405, + "time_per_iteration": 2.7470028400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093931, + "balance_loss_mlp": 1.08265948, + "diversity_loss_mlp": 0.0, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.06660378994806349, + "language_loss": 0.84838545, + "learning_rate": 0.0005840273312975317, + "loss": 0.85932475, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2406, + "time_per_iteration": 2.834179162979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082019, + "balance_loss_mlp": 1.07018733, + "diversity_loss_mlp": 0.0, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.07201348711751891, + "language_loss": 0.89853442, + "learning_rate": 0.0005837202035513555, + "loss": 0.90935457, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2407, + "time_per_iteration": 2.578505277633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081302, + "balance_loss_mlp": 1.06933987, + "diversity_loss_mlp": 0.0, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06479654524201506, + "language_loss": 0.81299376, + "learning_rate": 0.0005834130433021136, + "loss": 0.82380676, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2408, + "time_per_iteration": 2.742830991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075359, + "balance_loss_mlp": 1.0631156, + "diversity_loss_mlp": 0.0, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.06628126289532602, + "language_loss": 0.73402894, + "learning_rate": 0.0005831058506690563, + "loss": 0.74478251, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2409, + "time_per_iteration": 2.6239566802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875374, + "balance_loss_mlp": 1.5126431, + "diversity_loss_mlp": 0.20975235, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.03030502692098504, + "language_loss": 0.86162984, + "learning_rate": 0.0005827986257714464, + "loss": 0.87038362, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01417591, + "step": 2410, + "time_per_iteration": 2.9302031993865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.05664992, + "diversity_loss_mlp": 0.0, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.07558638886093381, + "language_loss": 0.88803709, + "learning_rate": 0.0005824913687285591, + "loss": 0.89872897, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2411, + "time_per_iteration": 2.685814142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070655, + "balance_loss_mlp": 1.05821514, + "diversity_loss_mlp": 0.0, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.1080687232114875, + "language_loss": 0.81367224, + "learning_rate": 0.0005821840796596821, + "loss": 0.82437879, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2412, + "time_per_iteration": 2.6551058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.06099916, + "diversity_loss_mlp": 0.0, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.07026214254932567, + "language_loss": 0.80428362, + "learning_rate": 0.0005818767586841158, + "loss": 0.81501973, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 2413, + "time_per_iteration": 2.759437322616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.07259476, + "diversity_loss_mlp": 0.0, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.08627931539992734, + "language_loss": 0.86441922, + "learning_rate": 0.0005815694059211726, + "loss": 0.8752715, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 2414, + "time_per_iteration": 2.658977746963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171514, + "balance_loss_mlp": 1.16250181, + "diversity_loss_mlp": 0.0, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.047494824411654174, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82045138, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 2415, + "time_per_iteration": 4.799519777297974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145606, + "balance_loss_mlp": 1.13711834, + "diversity_loss_mlp": 0.0, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.043373387729815825, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78090668, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 2416, + "time_per_iteration": 4.990553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0087124, + "balance_loss_mlp": 1.50839305, + "diversity_loss_mlp": 0.20828754, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.030578892859867562, + "language_loss": 0.86378521, + "learning_rate": 0.0005806471581013931, + "loss": 0.87249762, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289999, + "step": 2417, + "time_per_iteration": 2.6900436878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122345, + "balance_loss_mlp": 1.11040044, + "diversity_loss_mlp": 0.0, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.07418438196536063, + "language_loss": 0.78360349, + "learning_rate": 0.0005803396793823146, + "loss": 0.79482698, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2418, + "time_per_iteration": 2.8027873039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113389, + "balance_loss_mlp": 1.12212396, + "diversity_loss_mlp": 0.0, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07660062238284089, + "language_loss": 0.85582161, + "learning_rate": 0.0005800321694726065, + "loss": 0.86716056, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2419, + "time_per_iteration": 4.293209075927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870744, + "balance_loss_mlp": 1.50698626, + "diversity_loss_mlp": 0.20827082, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.03270390918014964, + "language_loss": 0.86636543, + "learning_rate": 0.0005797246284916545, + "loss": 0.87507284, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01311516, + "step": 2420, + "time_per_iteration": 2.7184417247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112587, + "balance_loss_mlp": 1.1061976, + "diversity_loss_mlp": 0.0, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.04763479459010098, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78617769, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2421, + "time_per_iteration": 4.978823900222778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.1527952, + "diversity_loss_mlp": 0.0, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.08359324638355049, + "language_loss": 0.87635398, + "learning_rate": 0.0005791094537936233, + "loss": 0.8879956, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2422, + "time_per_iteration": 2.706270217895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145768, + "balance_loss_mlp": 1.1349256, + "diversity_loss_mlp": 0.0, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.07317342210777962, + "language_loss": 0.81790811, + "learning_rate": 0.0005788018203153762, + "loss": 0.82936579, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2423, + "time_per_iteration": 2.5965187549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114513, + "balance_loss_mlp": 1.13404965, + "diversity_loss_mlp": 0.0, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08308161607945047, + "language_loss": 0.85607517, + "learning_rate": 0.000578494156243549, + "loss": 0.86752647, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2424, + "time_per_iteration": 2.5783984661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124685, + "balance_loss_mlp": 1.1135745, + "diversity_loss_mlp": 0.0, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.06702614551613306, + "language_loss": 0.88852286, + "learning_rate": 0.0005781864616975878, + "loss": 0.89976966, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2425, + "time_per_iteration": 2.6615347862243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.09463954, + "diversity_loss_mlp": 0.0, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0790317604017366, + "language_loss": 0.84397781, + "learning_rate": 0.0005778787367969502, + "loss": 0.85503376, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2426, + "time_per_iteration": 2.5796711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095822, + "balance_loss_mlp": 1.08478928, + "diversity_loss_mlp": 0.0, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.062032004097500974, + "language_loss": 0.80925953, + "learning_rate": 0.0005775709816611053, + "loss": 0.82021779, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.11029053, + "routerloss_mlp": 0.0, + "step": 2427, + "time_per_iteration": 2.9491348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07454419, + "diversity_loss_mlp": 0.0, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0676389696771178, + "language_loss": 0.83549029, + "learning_rate": 0.0005772631964095346, + "loss": 0.8463425, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 2428, + "time_per_iteration": 2.6981353759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081501, + "balance_loss_mlp": 1.07072484, + "diversity_loss_mlp": 0.0, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.08126061261115217, + "language_loss": 0.8576231, + "learning_rate": 0.000576955381161731, + "loss": 0.86843812, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2429, + "time_per_iteration": 2.6633517742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.06313229, + "diversity_loss_mlp": 0.0, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.08275287351868318, + "language_loss": 0.86212349, + "learning_rate": 0.0005766475360371985, + "loss": 0.87286699, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2430, + "time_per_iteration": 2.5904853343963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.06205034, + "diversity_loss_mlp": 0.0, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0860704645170746, + "language_loss": 0.84563982, + "learning_rate": 0.0005763396611554536, + "loss": 0.85636878, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2431, + "time_per_iteration": 2.6467607021331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.0607698, + "diversity_loss_mlp": 0.0, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.08998246562287979, + "language_loss": 0.80544329, + "learning_rate": 0.0005760317566360237, + "loss": 0.81615859, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2432, + "time_per_iteration": 3.006641387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075816, + "balance_loss_mlp": 1.0648669, + "diversity_loss_mlp": 0.0, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.07509845156715887, + "language_loss": 0.84929144, + "learning_rate": 0.000575723822598448, + "loss": 0.86004961, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2433, + "time_per_iteration": 2.764425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067328, + "balance_loss_mlp": 1.0558188, + "diversity_loss_mlp": 0.0, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.06651895210271294, + "language_loss": 0.8167448, + "learning_rate": 0.0005754158591622773, + "loss": 0.82741809, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2434, + "time_per_iteration": 2.9786107540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075165, + "balance_loss_mlp": 1.06366098, + "diversity_loss_mlp": 0.0, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07251033111677281, + "language_loss": 0.82255369, + "learning_rate": 0.0005751078664470732, + "loss": 0.83330536, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2435, + "time_per_iteration": 2.5367684364318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079887, + "balance_loss_mlp": 1.06816268, + "diversity_loss_mlp": 0.0, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.07721942828462902, + "language_loss": 0.85977614, + "learning_rate": 0.0005747998445724094, + "loss": 0.87057501, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2436, + "time_per_iteration": 2.636200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_mlp": 1.07313251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.07122055500535385, + "language_loss": 0.89087129, + "learning_rate": 0.0005744917936578707, + "loss": 0.90172094, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2437, + "time_per_iteration": 2.7820210456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07790279, + "diversity_loss_mlp": 0.0, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.0674848593159629, + "language_loss": 0.84104413, + "learning_rate": 0.0005741837138230526, + "loss": 0.85194385, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2438, + "time_per_iteration": 2.7324602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091997, + "balance_loss_mlp": 1.07981968, + "diversity_loss_mlp": 0.0, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.08534673561441382, + "language_loss": 0.86345065, + "learning_rate": 0.0005738756051875627, + "loss": 0.87437063, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2439, + "time_per_iteration": 3.0705649852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098053, + "balance_loss_mlp": 1.08564377, + "diversity_loss_mlp": 0.0, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.06467123496854205, + "language_loss": 0.83114249, + "learning_rate": 0.0005735674678710192, + "loss": 0.84212297, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2440, + "time_per_iteration": 2.6645498275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.07644403, + "diversity_loss_mlp": 0.0, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.09155388913703945, + "language_loss": 0.81178355, + "learning_rate": 0.0005732593019930517, + "loss": 0.82267421, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2441, + "time_per_iteration": 2.892775774002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084176, + "balance_loss_mlp": 1.07203436, + "diversity_loss_mlp": 0.0, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.07090754106091501, + "language_loss": 0.87927258, + "learning_rate": 0.0005729511076733008, + "loss": 0.89011431, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2442, + "time_per_iteration": 2.629671096801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080039, + "balance_loss_mlp": 1.06766534, + "diversity_loss_mlp": 0.0, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.0886658808398658, + "language_loss": 0.85080904, + "learning_rate": 0.000572642885031418, + "loss": 0.86160946, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.1237793, + "routerloss_mlp": 0.0, + "step": 2443, + "time_per_iteration": 2.858177900314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083351, + "balance_loss_mlp": 1.07077432, + "diversity_loss_mlp": 0.0, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.06516149518751314, + "language_loss": 0.80735445, + "learning_rate": 0.0005723346341870662, + "loss": 0.81818795, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.12573242, + "routerloss_mlp": 0.0, + "step": 2444, + "time_per_iteration": 2.7146968841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_mlp": 1.07161689, + "diversity_loss_mlp": 0.0, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.08093347646647668, + "language_loss": 0.86360067, + "learning_rate": 0.0005720263552599188, + "loss": 0.87444162, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2445, + "time_per_iteration": 2.5240447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077035, + "balance_loss_mlp": 1.06469131, + "diversity_loss_mlp": 0.0, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.10031003663616385, + "language_loss": 0.80052316, + "learning_rate": 0.0005717180483696604, + "loss": 0.81129348, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.12347412, + "routerloss_mlp": 0.0, + "step": 2446, + "time_per_iteration": 2.8576042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.06456566, + "diversity_loss_mlp": 0.0, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.06704052343949889, + "language_loss": 0.82989585, + "learning_rate": 0.0005714097136359862, + "loss": 0.84066319, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2447, + "time_per_iteration": 2.624566078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841696, + "balance_loss_mlp": 1.45028305, + "diversity_loss_mlp": 0.205522, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.027205551471082397, + "language_loss": 0.86918223, + "learning_rate": 0.0005711013511786027, + "loss": 0.87759912, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01379322, + "step": 2448, + "time_per_iteration": 2.797086238861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106901, + "balance_loss_mlp": 1.05689788, + "diversity_loss_mlp": 0.0, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06342125158561994, + "language_loss": 0.83811176, + "learning_rate": 0.0005707929611172263, + "loss": 0.84880185, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2449, + "time_per_iteration": 2.731825351715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.05951726, + "diversity_loss_mlp": 0.0, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.09170207604049842, + "language_loss": 0.84256124, + "learning_rate": 0.000570484543571585, + "loss": 0.85327655, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2450, + "time_per_iteration": 2.5735461711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_mlp": 1.05268502, + "diversity_loss_mlp": 0.0, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.08479509676509417, + "language_loss": 0.82936448, + "learning_rate": 0.0005701760986614171, + "loss": 0.84001064, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2451, + "time_per_iteration": 2.537297248840332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071081, + "balance_loss_mlp": 1.0591718, + "diversity_loss_mlp": 0.0, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.059658494784791405, + "language_loss": 0.8734417, + "learning_rate": 0.0005698676265064714, + "loss": 0.88415247, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2452, + "time_per_iteration": 2.5586979389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.06525099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.0707454592736124, + "language_loss": 0.89208829, + "learning_rate": 0.0005695591272265074, + "loss": 0.90285689, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2453, + "time_per_iteration": 2.527719736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088311, + "balance_loss_mlp": 1.07617581, + "diversity_loss_mlp": 0.0, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.07134640406799209, + "language_loss": 0.81947398, + "learning_rate": 0.0005692506009412954, + "loss": 0.83035707, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2454, + "time_per_iteration": 2.6558947563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0064123, + "balance_loss_mlp": 1.11988485, + "diversity_loss_mlp": 0.13842735, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.002527541257966033, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78192496, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01207405, + "step": 2455, + "time_per_iteration": 5.005730628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.07716715, + "diversity_loss_mlp": 0.0, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07179176619920838, + "language_loss": 0.89308333, + "learning_rate": 0.0005686334678342593, + "loss": 0.90397304, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2456, + "time_per_iteration": 2.8779940605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094143, + "balance_loss_mlp": 1.08280611, + "diversity_loss_mlp": 0.0, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08187467616753978, + "language_loss": 0.81664062, + "learning_rate": 0.0005683248612520274, + "loss": 0.82758206, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2457, + "time_per_iteration": 3.0844156742095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087436, + "balance_loss_mlp": 1.07605195, + "diversity_loss_mlp": 0.0, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.08330432962991885, + "language_loss": 0.83940041, + "learning_rate": 0.0005680162281437321, + "loss": 0.85027468, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2458, + "time_per_iteration": 2.886364221572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_mlp": 1.07263231, + "diversity_loss_mlp": 0.0, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.06607837126207569, + "language_loss": 0.84340584, + "learning_rate": 0.000567707568629195, + "loss": 0.8542465, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2459, + "time_per_iteration": 2.7153613567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.0712074, + "diversity_loss_mlp": 0.0, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.0662532862091719, + "language_loss": 0.82247961, + "learning_rate": 0.0005673988828282486, + "loss": 0.8333075, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2460, + "time_per_iteration": 2.6740705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06760526, + "diversity_loss_mlp": 0.0, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05997115702153478, + "language_loss": 0.81122911, + "learning_rate": 0.0005670901708607352, + "loss": 0.82202172, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2461, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077887, + "balance_loss_mlp": 1.0661211, + "diversity_loss_mlp": 0.0, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.12722631062247966, + "language_loss": 0.83784962, + "learning_rate": 0.0005667814328465076, + "loss": 0.84862852, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2462, + "time_per_iteration": 2.62223744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_mlp": 1.06031179, + "diversity_loss_mlp": 0.0, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.10920156375550993, + "language_loss": 0.82163846, + "learning_rate": 0.0005664726689054285, + "loss": 0.83235747, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2463, + "time_per_iteration": 2.474776029586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.06096554, + "diversity_loss_mlp": 0.0, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.07990467081118383, + "language_loss": 0.80772603, + "learning_rate": 0.0005661638791573704, + "loss": 0.81845051, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2464, + "time_per_iteration": 2.699165105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073096, + "balance_loss_mlp": 1.06145513, + "diversity_loss_mlp": 0.0, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.06593248790897067, + "language_loss": 0.86978662, + "learning_rate": 0.0005658550637222164, + "loss": 0.8805176, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2465, + "time_per_iteration": 2.6154093742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.0586381, + "diversity_loss_mlp": 0.0, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.06422453310815268, + "language_loss": 0.82103038, + "learning_rate": 0.0005655462227198592, + "loss": 0.83173257, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2466, + "time_per_iteration": 2.888040065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068955, + "balance_loss_mlp": 1.05703366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.07464863741428074, + "language_loss": 0.84426093, + "learning_rate": 0.0005652373562702016, + "loss": 0.85495043, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2467, + "time_per_iteration": 2.6240220069885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071196, + "balance_loss_mlp": 1.05926943, + "diversity_loss_mlp": 0.0, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06778780294468974, + "language_loss": 0.88405621, + "learning_rate": 0.000564928464493156, + "loss": 0.89476824, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2468, + "time_per_iteration": 2.598493814468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068768, + "balance_loss_mlp": 1.05676329, + "diversity_loss_mlp": 0.0, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.06443301027733518, + "language_loss": 0.81735635, + "learning_rate": 0.000564619547508645, + "loss": 0.82804406, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2469, + "time_per_iteration": 4.510512828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_mlp": 1.05816698, + "diversity_loss_mlp": 0.0, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.0879456232971056, + "language_loss": 0.82882106, + "learning_rate": 0.0005643106054366008, + "loss": 0.83952397, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2470, + "time_per_iteration": 2.5648152828216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.06276536, + "diversity_loss_mlp": 0.0, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.06194770014341408, + "language_loss": 0.79193991, + "learning_rate": 0.000564001638396965, + "loss": 0.8026849, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2471, + "time_per_iteration": 2.7267987728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073205, + "balance_loss_mlp": 1.06152296, + "diversity_loss_mlp": 0.0, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.06505306942508977, + "language_loss": 0.82164901, + "learning_rate": 0.0005636926465096897, + "loss": 0.83238107, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2472, + "time_per_iteration": 3.035590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078551, + "balance_loss_mlp": 1.06670165, + "diversity_loss_mlp": 0.0, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08684318660371242, + "language_loss": 0.8723672, + "learning_rate": 0.0005633836298947363, + "loss": 0.88315272, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2473, + "time_per_iteration": 4.002026796340942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091096, + "balance_loss_mlp": 1.07912695, + "diversity_loss_mlp": 0.0, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.0706680414575132, + "language_loss": 0.70566314, + "learning_rate": 0.000563074588672075, + "loss": 0.71657413, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2474, + "time_per_iteration": 2.6985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089769, + "balance_loss_mlp": 1.07802129, + "diversity_loss_mlp": 0.0, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06282750442858279, + "language_loss": 0.85378051, + "learning_rate": 0.0005627655229616868, + "loss": 0.86467826, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2475, + "time_per_iteration": 2.7580935955047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091111, + "balance_loss_mlp": 1.07941031, + "diversity_loss_mlp": 0.0, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07002888905047219, + "language_loss": 0.90058106, + "learning_rate": 0.0005624564328835616, + "loss": 0.91149217, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2476, + "time_per_iteration": 2.789257764816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108666, + "balance_loss_mlp": 1.07509637, + "diversity_loss_mlp": 0.0, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06042863191219761, + "language_loss": 0.84203571, + "learning_rate": 0.0005621473185576986, + "loss": 0.85290229, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2477, + "time_per_iteration": 2.724280834197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_mlp": 1.07846594, + "diversity_loss_mlp": 0.0, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.07203405271885309, + "language_loss": 0.87555075, + "learning_rate": 0.0005618381801041068, + "loss": 0.88644993, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2478, + "time_per_iteration": 2.6800026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085708, + "balance_loss_mlp": 1.0738883, + "diversity_loss_mlp": 0.0, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.08495018756940642, + "language_loss": 0.83006722, + "learning_rate": 0.0005615290176428044, + "loss": 0.84092432, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2479, + "time_per_iteration": 2.6456432342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078377, + "balance_loss_mlp": 1.06658673, + "diversity_loss_mlp": 0.0, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07371403414772894, + "language_loss": 0.84979588, + "learning_rate": 0.0005612198312938187, + "loss": 0.86057961, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2480, + "time_per_iteration": 2.7325923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_mlp": 1.0737772, + "diversity_loss_mlp": 0.0, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.05926830515799366, + "language_loss": 0.79493093, + "learning_rate": 0.0005609106211771868, + "loss": 0.80578327, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2481, + "time_per_iteration": 2.8374931812286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108305, + "balance_loss_mlp": 1.07103384, + "diversity_loss_mlp": 0.0, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.06643858588339867, + "language_loss": 0.88938701, + "learning_rate": 0.0005606013874129543, + "loss": 0.90021759, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2482, + "time_per_iteration": 2.7547929286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081649, + "balance_loss_mlp": 1.07017505, + "diversity_loss_mlp": 0.0, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06416127972697647, + "language_loss": 0.80410159, + "learning_rate": 0.0005602921301211768, + "loss": 0.81491804, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2483, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080053, + "balance_loss_mlp": 1.06850159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.07652865967226291, + "language_loss": 0.8209163, + "learning_rate": 0.0005599828494219185, + "loss": 0.83171678, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.11541748, + "routerloss_mlp": 0.0, + "step": 2484, + "time_per_iteration": 2.5415024757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.05903542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.07721505579443601, + "language_loss": 0.89162952, + "learning_rate": 0.0005596735454352527, + "loss": 0.90233779, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2485, + "time_per_iteration": 2.8591346740722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077742, + "balance_loss_mlp": 1.06591046, + "diversity_loss_mlp": 0.0, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07819028279068943, + "language_loss": 0.85696715, + "learning_rate": 0.0005593642182812619, + "loss": 0.86774457, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2486, + "time_per_iteration": 2.679927349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.06575358, + "diversity_loss_mlp": 0.0, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.0859238614993436, + "language_loss": 0.83753216, + "learning_rate": 0.0005590548680800378, + "loss": 0.84830678, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2487, + "time_per_iteration": 3.0984909534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.05950415, + "diversity_loss_mlp": 0.0, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.06795851613398404, + "language_loss": 0.76434267, + "learning_rate": 0.0005587454949516804, + "loss": 0.77505481, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2488, + "time_per_iteration": 2.692324161529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.06507468, + "diversity_loss_mlp": 0.0, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.06921637005003253, + "language_loss": 0.8785038, + "learning_rate": 0.0005584360990162993, + "loss": 0.88927084, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2489, + "time_per_iteration": 2.646521806716919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077817, + "balance_loss_mlp": 1.06614649, + "diversity_loss_mlp": 0.0, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06386300972416134, + "language_loss": 0.85713631, + "learning_rate": 0.0005581266803940124, + "loss": 0.86791456, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2490, + "time_per_iteration": 2.735152244567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.05925143, + "diversity_loss_mlp": 0.0, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0718717211843218, + "language_loss": 0.87536263, + "learning_rate": 0.0005578172392049471, + "loss": 0.88607073, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2491, + "time_per_iteration": 2.7718377113342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00892921, + "balance_loss_mlp": 1.54530287, + "diversity_loss_mlp": 0.21191472, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.033555176901221506, + "language_loss": 0.84551859, + "learning_rate": 0.0005575077755692386, + "loss": 0.85444778, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01431197, + "step": 2492, + "time_per_iteration": 2.81888747215271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05893993, + "diversity_loss_mlp": 0.0, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.054684262853474656, + "language_loss": 0.86001486, + "learning_rate": 0.0005571982896070316, + "loss": 0.8707189, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2493, + "time_per_iteration": 2.655311346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_mlp": 1.07248712, + "diversity_loss_mlp": 0.0, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.07545203546694841, + "language_loss": 0.89854079, + "learning_rate": 0.0005568887814384792, + "loss": 0.90938115, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2494, + "time_per_iteration": 2.5930681228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_mlp": 1.07098675, + "diversity_loss_mlp": 0.0, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07194257940045806, + "language_loss": 0.87281573, + "learning_rate": 0.000556579251183743, + "loss": 0.88364077, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2495, + "time_per_iteration": 2.6386003494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076942, + "balance_loss_mlp": 1.06520605, + "diversity_loss_mlp": 0.0, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.0750590648958695, + "language_loss": 0.80158448, + "learning_rate": 0.0005562696989629936, + "loss": 0.81235385, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.11737061, + "routerloss_mlp": 0.0, + "step": 2496, + "time_per_iteration": 2.7050864696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880705, + "balance_loss_mlp": 1.52288473, + "diversity_loss_mlp": 0.21003026, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.02916103721032611, + "language_loss": 0.82606125, + "learning_rate": 0.0005559601248964095, + "loss": 0.83486831, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424794, + "step": 2497, + "time_per_iteration": 2.6473939418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_mlp": 1.0741564, + "diversity_loss_mlp": 0.0, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.07410871061403823, + "language_loss": 0.85882998, + "learning_rate": 0.0005556505291041783, + "loss": 0.86968333, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.11175537, + "routerloss_mlp": 0.0, + "step": 2498, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105898, + "balance_loss_mlp": 1.09428692, + "diversity_loss_mlp": 0.0, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.06465509842390993, + "language_loss": 0.84413946, + "learning_rate": 0.0005553409117064954, + "loss": 0.8551985, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2499, + "time_per_iteration": 2.880300521850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00859857, + "balance_loss_mlp": 1.48415303, + "diversity_loss_mlp": 0.20870377, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.02869897963967695, + "language_loss": 0.84937358, + "learning_rate": 0.0005550312728235654, + "loss": 0.85797209, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01342856, + "step": 2500, + "time_per_iteration": 2.7199203968048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_mlp": 1.08251953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07331859457791397, + "language_loss": 0.83879191, + "learning_rate": 0.0005547216125756003, + "loss": 0.84973377, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2501, + "time_per_iteration": 2.732786178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098928, + "balance_loss_mlp": 1.08708501, + "diversity_loss_mlp": 0.0, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07387575947985975, + "language_loss": 0.82064617, + "learning_rate": 0.0005544119310828211, + "loss": 0.83163536, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2502, + "time_per_iteration": 3.1029446125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_mlp": 1.08865714, + "diversity_loss_mlp": 0.0, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.06596898477591598, + "language_loss": 0.84657413, + "learning_rate": 0.0005541022284654568, + "loss": 0.8575809, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2503, + "time_per_iteration": 2.901026725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092163, + "balance_loss_mlp": 1.08015907, + "diversity_loss_mlp": 0.0, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.0759157238743441, + "language_loss": 0.83907866, + "learning_rate": 0.0005537925048437446, + "loss": 0.85000032, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2504, + "time_per_iteration": 2.6014060974121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00594545, + "balance_loss_mlp": 1.03097272, + "diversity_loss_mlp": 0.13453583, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.0017952613590721677, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76346016, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179097, + "step": 2505, + "time_per_iteration": 4.960138320922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867388, + "balance_loss_mlp": 1.49711311, + "diversity_loss_mlp": 0.20998067, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.029195885141922995, + "language_loss": 0.88189656, + "learning_rate": 0.0005531729950682664, + "loss": 0.8905704, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01384138, + "step": 2506, + "time_per_iteration": 3.056671142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_mlp": 1.07027662, + "diversity_loss_mlp": 0.0, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.09591114443507165, + "language_loss": 0.84746361, + "learning_rate": 0.000552863209155015, + "loss": 0.85828793, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2507, + "time_per_iteration": 2.473930835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00866012, + "balance_loss_mlp": 1.49284506, + "diversity_loss_mlp": 0.21081753, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.03047035716712285, + "language_loss": 0.82048851, + "learning_rate": 0.0005525534027184461, + "loss": 0.82914865, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01418037, + "step": 2508, + "time_per_iteration": 2.5708260536193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078028, + "balance_loss_mlp": 1.06624985, + "diversity_loss_mlp": 0.0, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.06261213728600334, + "language_loss": 0.83131289, + "learning_rate": 0.0005522435758788365, + "loss": 0.84209323, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2509, + "time_per_iteration": 2.7291650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00853572, + "balance_loss_mlp": 1.46908307, + "diversity_loss_mlp": 0.20966808, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.03495470447814039, + "language_loss": 0.80126894, + "learning_rate": 0.0005519337287564721, + "loss": 0.80980462, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01419635, + "step": 2510, + "time_per_iteration": 2.843698024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_mlp": 1.06536365, + "diversity_loss_mlp": 0.0, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07525780944119016, + "language_loss": 0.83495927, + "learning_rate": 0.000551623861471646, + "loss": 0.84572971, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2511, + "time_per_iteration": 2.7327091693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.1273582, + "diversity_loss_mlp": 0.0, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.052890092991212126, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79952717, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 2512, + "time_per_iteration": 4.820046901702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073764, + "balance_loss_mlp": 1.06182551, + "diversity_loss_mlp": 0.0, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.09417698665840035, + "language_loss": 0.8670119, + "learning_rate": 0.0005510040668958211, + "loss": 0.87774956, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2513, + "time_per_iteration": 2.579780101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051826, + "balance_loss_mlp": 1.04515004, + "diversity_loss_mlp": 0.0, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.02705432320804172, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78812408, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2514, + "time_per_iteration": 4.83507227897644 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106953, + "balance_loss_mlp": 1.05716157, + "diversity_loss_mlp": 0.0, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07432123735470587, + "language_loss": 0.83170015, + "learning_rate": 0.0005503841931138645, + "loss": 0.84239542, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2515, + "time_per_iteration": 2.6834895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071737, + "balance_loss_mlp": 1.05963731, + "diversity_loss_mlp": 0.0, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07510504832931036, + "language_loss": 0.81515384, + "learning_rate": 0.0005500742268214025, + "loss": 0.82587123, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2516, + "time_per_iteration": 2.494479179382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084077, + "balance_loss_mlp": 1.0715425, + "diversity_loss_mlp": 0.0, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06432693662792612, + "language_loss": 0.85142744, + "learning_rate": 0.0005497642410884014, + "loss": 0.86226821, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2517, + "time_per_iteration": 2.760425090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.06788325, + "diversity_loss_mlp": 0.0, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.06763953923030977, + "language_loss": 0.85120749, + "learning_rate": 0.0005494542360352085, + "loss": 0.86201251, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2518, + "time_per_iteration": 2.6524109840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108191, + "balance_loss_mlp": 1.06955993, + "diversity_loss_mlp": 0.0, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.06089591080825084, + "language_loss": 0.85741639, + "learning_rate": 0.0005491442117821783, + "loss": 0.86823547, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2519, + "time_per_iteration": 2.7461459636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079216, + "balance_loss_mlp": 1.06654429, + "diversity_loss_mlp": 0.0, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.07584750574127574, + "language_loss": 0.87494171, + "learning_rate": 0.0005488341684496732, + "loss": 0.88573384, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2520, + "time_per_iteration": 2.6621458530426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.06843615, + "diversity_loss_mlp": 0.0, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06605179609441998, + "language_loss": 0.9207437, + "learning_rate": 0.0005485241061580624, + "loss": 0.9315502, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2521, + "time_per_iteration": 2.772949457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.07741094, + "diversity_loss_mlp": 0.0, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.06556104217544546, + "language_loss": 0.8458938, + "learning_rate": 0.0005482140250277228, + "loss": 0.85679281, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2522, + "time_per_iteration": 2.978330135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847105, + "balance_loss_mlp": 1.45509815, + "diversity_loss_mlp": 0.21114388, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.03368619412239962, + "language_loss": 0.87090278, + "learning_rate": 0.0005479039251790387, + "loss": 0.87937379, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01398425, + "step": 2523, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00840008, + "balance_loss_mlp": 1.44148707, + "diversity_loss_mlp": 0.21069397, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.03188648694570784, + "language_loss": 0.84722733, + "learning_rate": 0.0005475938067324014, + "loss": 0.85562754, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0139178, + "step": 2524, + "time_per_iteration": 2.859184980392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106923, + "balance_loss_mlp": 1.09528267, + "diversity_loss_mlp": 0.0, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.06962736532334403, + "language_loss": 0.83518255, + "learning_rate": 0.0005472836698082098, + "loss": 0.84625173, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2525, + "time_per_iteration": 2.534783363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_mlp": 1.08923149, + "diversity_loss_mlp": 0.0, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.07423434170097615, + "language_loss": 0.84140873, + "learning_rate": 0.0005469735145268694, + "loss": 0.85241902, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2526, + "time_per_iteration": 2.7064108848571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090982, + "balance_loss_mlp": 1.07928169, + "diversity_loss_mlp": 0.0, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0731540325655248, + "language_loss": 0.81093931, + "learning_rate": 0.0005466633410087933, + "loss": 0.82184911, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2527, + "time_per_iteration": 2.682969570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_mlp": 1.07793164, + "diversity_loss_mlp": 0.0, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03711409557498352, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78346336, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.07568359, + "routerloss_mlp": 0.0, + "step": 2528, + "time_per_iteration": 4.962444067001343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_mlp": 1.07360601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.07791605184695856, + "language_loss": 0.88148236, + "learning_rate": 0.0005460429397441214, + "loss": 0.89233321, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2529, + "time_per_iteration": 2.5908102989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00835644, + "balance_loss_mlp": 1.43002903, + "diversity_loss_mlp": 0.21195745, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.03186279831907627, + "language_loss": 0.87013817, + "learning_rate": 0.0005457327122383866, + "loss": 0.87849462, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01465126, + "step": 2530, + "time_per_iteration": 2.656264543533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_mlp": 1.02939153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.02373673385224348, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75673413, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.07519531, + "routerloss_mlp": 0.0, + "step": 2531, + "time_per_iteration": 4.838496208190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_mlp": 1.08965194, + "diversity_loss_mlp": 0.0, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.06845758574896237, + "language_loss": 0.75823385, + "learning_rate": 0.0005451122040823244, + "loss": 0.76924324, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2532, + "time_per_iteration": 2.770751714706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099102, + "balance_loss_mlp": 1.08746696, + "diversity_loss_mlp": 0.0, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07387169787784394, + "language_loss": 0.77164292, + "learning_rate": 0.0005448019236728997, + "loss": 0.7826339, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2533, + "time_per_iteration": 2.8874497413635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837303, + "balance_loss_mlp": 1.43305767, + "diversity_loss_mlp": 0.21233971, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.03246629845535473, + "language_loss": 0.8471576, + "learning_rate": 0.0005444916258698255, + "loss": 0.85553062, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01460437, + "step": 2534, + "time_per_iteration": 2.623748540878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112009, + "balance_loss_mlp": 1.10867584, + "diversity_loss_mlp": 0.0, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06488105381348498, + "language_loss": 0.86077154, + "learning_rate": 0.0005441813107935704, + "loss": 0.87197244, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2535, + "time_per_iteration": 2.6705739498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124443, + "balance_loss_mlp": 1.11277819, + "diversity_loss_mlp": 0.0, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07112550287999594, + "language_loss": 0.86025345, + "learning_rate": 0.0005438709785646091, + "loss": 0.87149793, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2536, + "time_per_iteration": 2.5624749660491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120427, + "balance_loss_mlp": 1.10864902, + "diversity_loss_mlp": 0.0, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.08492074314505418, + "language_loss": 0.86885595, + "learning_rate": 0.0005435606293034234, + "loss": 0.8800602, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2537, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121847, + "balance_loss_mlp": 1.11035514, + "diversity_loss_mlp": 0.0, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.08214525409599778, + "language_loss": 0.84619427, + "learning_rate": 0.0005432502631305016, + "loss": 0.8574127, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2538, + "time_per_iteration": 2.700613021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.10190618, + "diversity_loss_mlp": 0.0, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06429037959601741, + "language_loss": 0.83193302, + "learning_rate": 0.0005429398801663386, + "loss": 0.84306723, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2539, + "time_per_iteration": 2.9839913845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097658, + "balance_loss_mlp": 1.08599913, + "diversity_loss_mlp": 0.0, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.12053819121868696, + "language_loss": 0.8290484, + "learning_rate": 0.0005426294805314355, + "loss": 0.84002495, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2540, + "time_per_iteration": 2.5029373168945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.08291781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.06245664696917761, + "language_loss": 0.80155998, + "learning_rate": 0.0005423190643463003, + "loss": 0.81250799, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2541, + "time_per_iteration": 2.949772357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093208, + "balance_loss_mlp": 1.08163261, + "diversity_loss_mlp": 0.0, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.07791209549750817, + "language_loss": 0.8281579, + "learning_rate": 0.0005420086317314473, + "loss": 0.83908999, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2542, + "time_per_iteration": 2.6383941173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088553, + "balance_loss_mlp": 1.0765729, + "diversity_loss_mlp": 0.0, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06362759827284906, + "language_loss": 0.81081557, + "learning_rate": 0.0005416981828073971, + "loss": 0.82170111, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.11981201, + "routerloss_mlp": 0.0, + "step": 2543, + "time_per_iteration": 2.8023576736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007156, + "balance_loss_mlp": 0.99990815, + "diversity_loss_mlp": 0.0, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.01938913368632236, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78122175, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 2544, + "time_per_iteration": 4.817458629608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.08184147, + "diversity_loss_mlp": 0.0, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.08678858450341921, + "language_loss": 0.84937072, + "learning_rate": 0.000541077236513819, + "loss": 0.86030519, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2545, + "time_per_iteration": 2.5271120071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089352, + "balance_loss_mlp": 1.07800293, + "diversity_loss_mlp": 0.0, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.07207098978073255, + "language_loss": 0.82449925, + "learning_rate": 0.0005407667393853638, + "loss": 0.83539271, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2546, + "time_per_iteration": 2.6385204792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.08250618, + "diversity_loss_mlp": 0.0, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06843607218978102, + "language_loss": 0.83673334, + "learning_rate": 0.0005404562264298569, + "loss": 0.84766948, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2547, + "time_per_iteration": 2.845250368118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102851, + "balance_loss_mlp": 1.09120405, + "diversity_loss_mlp": 0.0, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.06940893068641271, + "language_loss": 0.83999467, + "learning_rate": 0.0005401456977678498, + "loss": 0.8510232, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2548, + "time_per_iteration": 2.638720750808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099322, + "balance_loss_mlp": 1.08754444, + "diversity_loss_mlp": 0.0, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08453175850654031, + "language_loss": 0.77431965, + "learning_rate": 0.0005398351535199008, + "loss": 0.78531289, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2549, + "time_per_iteration": 3.064035415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_mlp": 1.09175706, + "diversity_loss_mlp": 0.0, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.07238427843662706, + "language_loss": 0.84189212, + "learning_rate": 0.0005395245938065735, + "loss": 0.85292226, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2550, + "time_per_iteration": 2.7746829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118468, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.08583684211433391, + "language_loss": 0.82631576, + "learning_rate": 0.0005392140187484379, + "loss": 0.83750039, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2551, + "time_per_iteration": 2.582195281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124142, + "balance_loss_mlp": 1.11273384, + "diversity_loss_mlp": 0.0, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.0682243054902728, + "language_loss": 0.89719319, + "learning_rate": 0.0005389034284660701, + "loss": 0.90843463, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2552, + "time_per_iteration": 2.824427366256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_mlp": 1.12022352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.08386347311462448, + "language_loss": 0.82537109, + "learning_rate": 0.000538592823080052, + "loss": 0.83668673, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2553, + "time_per_iteration": 3.24122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.11565781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.06967590045443849, + "language_loss": 0.84592807, + "learning_rate": 0.000538282202710971, + "loss": 0.85719973, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2554, + "time_per_iteration": 2.5753910541534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130476, + "balance_loss_mlp": 1.11918652, + "diversity_loss_mlp": 0.0, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.07442252581599826, + "language_loss": 0.82315147, + "learning_rate": 0.000537971567479421, + "loss": 0.83445626, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2555, + "time_per_iteration": 2.7354228496551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127557, + "balance_loss_mlp": 1.11596429, + "diversity_loss_mlp": 0.0, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.09076326784032986, + "language_loss": 0.88129175, + "learning_rate": 0.0005376609175060011, + "loss": 0.8925674, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2556, + "time_per_iteration": 2.6124610900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_mlp": 1.09465659, + "diversity_loss_mlp": 0.0, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07210041581715526, + "language_loss": 0.80779845, + "learning_rate": 0.0005373502529113162, + "loss": 0.81886077, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2557, + "time_per_iteration": 2.823993444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100884, + "balance_loss_mlp": 1.08888519, + "diversity_loss_mlp": 0.0, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.07460313059090624, + "language_loss": 0.81449521, + "learning_rate": 0.0005370395738159773, + "loss": 0.82550406, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2558, + "time_per_iteration": 2.6436777114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00834873, + "balance_loss_mlp": 1.42800272, + "diversity_loss_mlp": 0.21467975, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.03347414568603151, + "language_loss": 0.82822633, + "learning_rate": 0.0005367288803406003, + "loss": 0.83657515, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01353174, + "step": 2559, + "time_per_iteration": 2.662224531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083349, + "balance_loss_mlp": 1.07132101, + "diversity_loss_mlp": 0.0, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0788259825299616, + "language_loss": 0.818443, + "learning_rate": 0.0005364181726058073, + "loss": 0.82927656, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2560, + "time_per_iteration": 2.686300277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.06417727, + "diversity_loss_mlp": 0.0, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.07955060847799823, + "language_loss": 0.8272332, + "learning_rate": 0.0005361074507322261, + "loss": 0.83799613, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2561, + "time_per_iteration": 2.5809431076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.06138754, + "diversity_loss_mlp": 0.0, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07091460094801966, + "language_loss": 0.81425411, + "learning_rate": 0.000535796714840489, + "loss": 0.82498884, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2562, + "time_per_iteration": 2.6425187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073356, + "balance_loss_mlp": 1.06107163, + "diversity_loss_mlp": 0.0, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.10871355986071002, + "language_loss": 0.83800626, + "learning_rate": 0.0005354859650512348, + "loss": 0.84873986, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2563, + "time_per_iteration": 2.7957375049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074544, + "balance_loss_mlp": 1.06282604, + "diversity_loss_mlp": 0.0, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.0798917687203661, + "language_loss": 0.87428886, + "learning_rate": 0.0005351752014851074, + "loss": 0.88503432, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2564, + "time_per_iteration": 2.6205673217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_mlp": 1.07352281, + "diversity_loss_mlp": 0.0, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.06874397476353511, + "language_loss": 0.83621442, + "learning_rate": 0.0005348644242627553, + "loss": 0.84706771, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2565, + "time_per_iteration": 2.7460625171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010105, + "balance_loss_mlp": 1.00411022, + "diversity_loss_mlp": 0.0, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.013767653611631516, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76297128, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2566, + "time_per_iteration": 4.943475723266602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110567, + "balance_loss_mlp": 1.09899187, + "diversity_loss_mlp": 0.0, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.08759046492811678, + "language_loss": 0.81650245, + "learning_rate": 0.0005342428293320013, + "loss": 0.82760805, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2567, + "time_per_iteration": 2.7889564037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_mlp": 1.09142327, + "diversity_loss_mlp": 0.0, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07999691418133484, + "language_loss": 0.8344667, + "learning_rate": 0.0005339320118649238, + "loss": 0.84549326, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2568, + "time_per_iteration": 2.7774229049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.09715271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.07608170940546952, + "language_loss": 0.86422324, + "learning_rate": 0.000533621181224271, + "loss": 0.87530512, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2569, + "time_per_iteration": 2.7708005905151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08442283, + "diversity_loss_mlp": 0.0, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.06858054906862693, + "language_loss": 0.8138749, + "learning_rate": 0.0005333103375307182, + "loss": 0.82483125, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2570, + "time_per_iteration": 2.8407034873962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07972121, + "diversity_loss_mlp": 0.0, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06174009778797697, + "language_loss": 0.85711801, + "learning_rate": 0.0005329994809049451, + "loss": 0.86802495, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.10974121, + "routerloss_mlp": 0.0, + "step": 2571, + "time_per_iteration": 2.7500712871551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096363, + "balance_loss_mlp": 1.08508563, + "diversity_loss_mlp": 0.0, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.06855083904022342, + "language_loss": 0.88066995, + "learning_rate": 0.0005326886114676375, + "loss": 0.89163363, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2572, + "time_per_iteration": 2.730137825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.07269001, + "diversity_loss_mlp": 0.0, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06053914015656951, + "language_loss": 0.88364595, + "learning_rate": 0.0005323777293394854, + "loss": 0.89448464, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2573, + "time_per_iteration": 2.539825201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.07365584, + "diversity_loss_mlp": 0.0, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06797932871808014, + "language_loss": 0.81904709, + "learning_rate": 0.000532066834641184, + "loss": 0.8298943, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.11065674, + "routerloss_mlp": 0.0, + "step": 2574, + "time_per_iteration": 2.6663713455200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_mlp": 1.09271336, + "diversity_loss_mlp": 0.0, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07191084425213706, + "language_loss": 0.85331243, + "learning_rate": 0.0005317559274934334, + "loss": 0.86435068, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2575, + "time_per_iteration": 2.756410598754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097116, + "balance_loss_mlp": 1.08592236, + "diversity_loss_mlp": 0.0, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.08893709148941176, + "language_loss": 0.80365205, + "learning_rate": 0.0005314450080169382, + "loss": 0.81462318, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2576, + "time_per_iteration": 2.613163471221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092174, + "balance_loss_mlp": 1.0810523, + "diversity_loss_mlp": 0.0, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.10818754121519983, + "language_loss": 0.8082127, + "learning_rate": 0.0005311340763324083, + "loss": 0.81913447, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2577, + "time_per_iteration": 2.5670807361602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087439, + "balance_loss_mlp": 1.07612574, + "diversity_loss_mlp": 0.0, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.07097138632102568, + "language_loss": 0.82323599, + "learning_rate": 0.0005308231325605578, + "loss": 0.83411032, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2578, + "time_per_iteration": 2.6519079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085757, + "balance_loss_mlp": 1.07421172, + "diversity_loss_mlp": 0.0, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06601832089031445, + "language_loss": 0.76727217, + "learning_rate": 0.0005305121768221061, + "loss": 0.7781297, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2579, + "time_per_iteration": 3.1306209564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_mlp": 1.03489161, + "diversity_loss_mlp": 0.0, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.022004289450105873, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76079202, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2580, + "time_per_iteration": 4.8141255378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079045, + "balance_loss_mlp": 1.06767821, + "diversity_loss_mlp": 0.0, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06618835036619775, + "language_loss": 0.91614985, + "learning_rate": 0.0005298902299282984, + "loss": 0.92694032, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2581, + "time_per_iteration": 2.586012125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087579, + "balance_loss_mlp": 1.07617044, + "diversity_loss_mlp": 0.0, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.07143589820149647, + "language_loss": 0.84265745, + "learning_rate": 0.0005295792390144033, + "loss": 0.85353327, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2582, + "time_per_iteration": 2.704911708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096311, + "balance_loss_mlp": 1.08442605, + "diversity_loss_mlp": 0.0, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.07556433689349051, + "language_loss": 0.83576399, + "learning_rate": 0.0005292682366168294, + "loss": 0.84672707, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2583, + "time_per_iteration": 2.5530638694763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_mlp": 1.09309435, + "diversity_loss_mlp": 0.0, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06699014279274042, + "language_loss": 0.80089158, + "learning_rate": 0.0005289572228563181, + "loss": 0.81194162, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2584, + "time_per_iteration": 2.729093551635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100707, + "balance_loss_mlp": 1.08861935, + "diversity_loss_mlp": 0.0, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.0657007833960997, + "language_loss": 0.83234823, + "learning_rate": 0.000528646197853616, + "loss": 0.8433553, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2585, + "time_per_iteration": 2.727252721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113697, + "balance_loss_mlp": 1.10166335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.07376563164337009, + "language_loss": 0.85810697, + "learning_rate": 0.0005283351617294735, + "loss": 0.86924398, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2586, + "time_per_iteration": 2.945610761642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011716, + "balance_loss_mlp": 1.00470638, + "diversity_loss_mlp": 0.0, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.017193207514109847, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77648377, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.0703125, + "routerloss_mlp": 0.0, + "step": 2587, + "time_per_iteration": 5.038366079330444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.07597303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06591325697086226, + "language_loss": 0.86769819, + "learning_rate": 0.0005277130565998916, + "loss": 0.87858337, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2588, + "time_per_iteration": 2.7726681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_mlp": 1.07443595, + "diversity_loss_mlp": 0.0, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05822748641904789, + "language_loss": 0.81899714, + "learning_rate": 0.0005274019878359748, + "loss": 0.82986516, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 2589, + "time_per_iteration": 2.733985424041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.06275249, + "diversity_loss_mlp": 0.0, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.0736619230298454, + "language_loss": 0.87174684, + "learning_rate": 0.0005270909084336628, + "loss": 0.88249791, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2590, + "time_per_iteration": 2.648728370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075145, + "balance_loss_mlp": 1.06231809, + "diversity_loss_mlp": 0.0, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.07329601175103365, + "language_loss": 0.8877548, + "learning_rate": 0.0005267798185137276, + "loss": 0.89850616, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.12835693, + "routerloss_mlp": 0.0, + "step": 2591, + "time_per_iteration": 2.616903066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061242, + "balance_loss_mlp": 1.04852843, + "diversity_loss_mlp": 0.0, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.0712913700859702, + "language_loss": 0.89140213, + "learning_rate": 0.0005264687181969444, + "loss": 0.90201461, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 2592, + "time_per_iteration": 2.7121951580047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067017, + "balance_loss_mlp": 1.05430353, + "diversity_loss_mlp": 0.0, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07969645648170227, + "language_loss": 0.75208342, + "learning_rate": 0.0005261576076040937, + "loss": 0.76275361, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 2593, + "time_per_iteration": 3.248811721801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_mlp": 1.04746807, + "diversity_loss_mlp": 0.0, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07355463018535204, + "language_loss": 0.84396625, + "learning_rate": 0.0005258464868559591, + "loss": 0.85456228, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2594, + "time_per_iteration": 2.6535778045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_mlp": 1.0461601, + "diversity_loss_mlp": 0.0, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06735340586139127, + "language_loss": 0.88490266, + "learning_rate": 0.0005255353560733284, + "loss": 0.89548326, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2595, + "time_per_iteration": 2.5711045265197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_mlp": 1.03453541, + "diversity_loss_mlp": 0.0, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.025598241729826776, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619136, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 2596, + "time_per_iteration": 4.7992448806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_mlp": 1.05498767, + "diversity_loss_mlp": 0.0, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.07107233717475309, + "language_loss": 0.83179224, + "learning_rate": 0.0005249130648877492, + "loss": 0.84246206, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2597, + "time_per_iteration": 2.7089900970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068426, + "balance_loss_mlp": 1.05646324, + "diversity_loss_mlp": 0.0, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.08792128719199578, + "language_loss": 0.84945238, + "learning_rate": 0.0005246019047263953, + "loss": 0.86013663, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2598, + "time_per_iteration": 2.4586942195892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070932, + "balance_loss_mlp": 1.0594883, + "diversity_loss_mlp": 0.0, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.08031275074858332, + "language_loss": 0.82562858, + "learning_rate": 0.0005242907350137353, + "loss": 0.83633792, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2599, + "time_per_iteration": 2.547146797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075807, + "balance_loss_mlp": 1.06445217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.08690624784708721, + "language_loss": 0.79332286, + "learning_rate": 0.0005239795558705754, + "loss": 0.80408096, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2600, + "time_per_iteration": 2.5985541343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077027, + "balance_loss_mlp": 1.06555915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.06025548364908716, + "language_loss": 0.89517641, + "learning_rate": 0.0005236683674177264, + "loss": 0.90594667, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2601, + "time_per_iteration": 2.6358349323272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090245, + "balance_loss_mlp": 1.07874131, + "diversity_loss_mlp": 0.0, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06252214062087984, + "language_loss": 0.82497251, + "learning_rate": 0.0005233571697760021, + "loss": 0.83587497, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2602, + "time_per_iteration": 2.8629817962646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112229, + "balance_loss_mlp": 1.10087442, + "diversity_loss_mlp": 0.0, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06974132169475507, + "language_loss": 0.8293485, + "learning_rate": 0.0005230459630662203, + "loss": 0.84047079, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.11352539, + "routerloss_mlp": 0.0, + "step": 2603, + "time_per_iteration": 2.939380168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114631, + "balance_loss_mlp": 1.10359812, + "diversity_loss_mlp": 0.0, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.10511771954620508, + "language_loss": 0.81605637, + "learning_rate": 0.0005227347474092022, + "loss": 0.82720268, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2604, + "time_per_iteration": 2.7169747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.11197877, + "diversity_loss_mlp": 0.0, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.07495893748856379, + "language_loss": 0.83243322, + "learning_rate": 0.0005224235229257724, + "loss": 0.84366548, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2605, + "time_per_iteration": 2.6940438747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113092, + "balance_loss_mlp": 1.10178471, + "diversity_loss_mlp": 0.0, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.06884013858989874, + "language_loss": 0.86851203, + "learning_rate": 0.0005221122897367589, + "loss": 0.87964296, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2606, + "time_per_iteration": 2.800685405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109504, + "balance_loss_mlp": 1.09854841, + "diversity_loss_mlp": 0.0, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08142217271827161, + "language_loss": 0.81335354, + "learning_rate": 0.0005218010479629932, + "loss": 0.82444859, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2607, + "time_per_iteration": 2.657087564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098904, + "balance_loss_mlp": 1.08753133, + "diversity_loss_mlp": 0.0, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.08269023882009051, + "language_loss": 0.82140303, + "learning_rate": 0.0005214897977253102, + "loss": 0.83239204, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2608, + "time_per_iteration": 2.649846076965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_mlp": 1.07372093, + "diversity_loss_mlp": 0.0, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.061165709745894754, + "language_loss": 0.84233439, + "learning_rate": 0.0005211785391445473, + "loss": 0.8531844, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2609, + "time_per_iteration": 2.7179222106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087043, + "balance_loss_mlp": 1.07538986, + "diversity_loss_mlp": 0.0, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.06641391212047838, + "language_loss": 0.79080439, + "learning_rate": 0.0005208672723415467, + "loss": 0.80167478, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2610, + "time_per_iteration": 2.7928884029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.07359457, + "diversity_loss_mlp": 0.0, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.07063839016412009, + "language_loss": 0.79436052, + "learning_rate": 0.0005205559974371525, + "loss": 0.80521345, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2611, + "time_per_iteration": 2.75744366645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085649, + "balance_loss_mlp": 1.07412767, + "diversity_loss_mlp": 0.0, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06307258943078059, + "language_loss": 0.82345438, + "learning_rate": 0.0005202447145522123, + "loss": 0.83431089, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2612, + "time_per_iteration": 2.6847879886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084149, + "balance_loss_mlp": 1.07245421, + "diversity_loss_mlp": 0.0, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.060686478103186246, + "language_loss": 0.79358983, + "learning_rate": 0.0005199334238075769, + "loss": 0.80443138, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2613, + "time_per_iteration": 2.560041666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_mlp": 1.07277226, + "diversity_loss_mlp": 0.0, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.086387426867178, + "language_loss": 0.91963339, + "learning_rate": 0.0005196221253241, + "loss": 0.93048155, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2614, + "time_per_iteration": 2.6397578716278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107839, + "balance_loss_mlp": 1.06617713, + "diversity_loss_mlp": 0.0, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.09198716130289855, + "language_loss": 0.82890773, + "learning_rate": 0.0005193108192226383, + "loss": 0.83969164, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2615, + "time_per_iteration": 2.7370193004608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.06396329, + "diversity_loss_mlp": 0.0, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.08941342921082604, + "language_loss": 0.86907744, + "learning_rate": 0.000518999505624052, + "loss": 0.87983918, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2616, + "time_per_iteration": 2.733515739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067104, + "balance_loss_mlp": 1.05521274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05504525356098391, + "language_loss": 0.83447164, + "learning_rate": 0.000518688184649203, + "loss": 0.84514272, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2617, + "time_per_iteration": 2.816542625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075166, + "balance_loss_mlp": 1.06264269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07489503160460931, + "language_loss": 0.83596766, + "learning_rate": 0.0005183768564189577, + "loss": 0.84671938, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2618, + "time_per_iteration": 2.5781893730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081949, + "balance_loss_mlp": 1.07029045, + "diversity_loss_mlp": 0.0, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.0695581827230682, + "language_loss": 0.81485611, + "learning_rate": 0.0005180655210541838, + "loss": 0.82567555, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2619, + "time_per_iteration": 2.5642077922821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091231, + "balance_loss_mlp": 1.07894695, + "diversity_loss_mlp": 0.0, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.08072673001204132, + "language_loss": 0.83226323, + "learning_rate": 0.0005177541786757527, + "loss": 0.84317553, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2620, + "time_per_iteration": 2.7365450859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_mlp": 1.0882231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.0921594393427519, + "language_loss": 0.82626402, + "learning_rate": 0.000517442829404538, + "loss": 0.83727121, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2621, + "time_per_iteration": 3.053333044052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097629, + "balance_loss_mlp": 1.08534431, + "diversity_loss_mlp": 0.0, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.0844592365120011, + "language_loss": 0.87026393, + "learning_rate": 0.0005171314733614166, + "loss": 0.88124025, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2622, + "time_per_iteration": 2.8867554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099715, + "balance_loss_mlp": 1.08721614, + "diversity_loss_mlp": 0.0, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.07191738026805333, + "language_loss": 0.78457403, + "learning_rate": 0.0005168201106672671, + "loss": 0.79557121, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 2623, + "time_per_iteration": 2.7532849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_mlp": 1.07122076, + "diversity_loss_mlp": 0.0, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.06664161086213699, + "language_loss": 0.84876573, + "learning_rate": 0.0005165087414429717, + "loss": 0.85960108, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2624, + "time_per_iteration": 2.614475965499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.061566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.06712294156504883, + "language_loss": 0.83509946, + "learning_rate": 0.0005161973658094144, + "loss": 0.84583604, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2625, + "time_per_iteration": 2.6536033153533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875819, + "balance_loss_mlp": 1.51064336, + "diversity_loss_mlp": 0.21324398, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.02954045761884847, + "language_loss": 0.82599998, + "learning_rate": 0.000515885983887482, + "loss": 0.83475816, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01387555, + "step": 2626, + "time_per_iteration": 2.801612138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070563, + "balance_loss_mlp": 1.05863595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.07357396162877478, + "language_loss": 0.84283531, + "learning_rate": 0.0005155745957980636, + "loss": 0.8535409, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2627, + "time_per_iteration": 2.6239585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071483, + "balance_loss_mlp": 1.0589962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.06901961430938243, + "language_loss": 0.88532668, + "learning_rate": 0.000515263201662051, + "loss": 0.89604151, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2628, + "time_per_iteration": 2.65803861618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107038, + "balance_loss_mlp": 1.05840504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.06314416177701848, + "language_loss": 0.8250618, + "learning_rate": 0.0005149518016003378, + "loss": 0.8357656, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2629, + "time_per_iteration": 3.1646623611450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.04946709, + "diversity_loss_mlp": 0.0, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.1007750022567515, + "language_loss": 0.82337832, + "learning_rate": 0.0005146403957338206, + "loss": 0.83399695, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 2630, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.05236936, + "diversity_loss_mlp": 0.0, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06667308072604639, + "language_loss": 0.82288837, + "learning_rate": 0.0005143289841833975, + "loss": 0.83353263, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2631, + "time_per_iteration": 2.8448615074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.05643749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.09203997555384738, + "language_loss": 0.82179189, + "learning_rate": 0.0005140175670699696, + "loss": 0.83247638, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2632, + "time_per_iteration": 2.642666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067258, + "balance_loss_mlp": 1.05545044, + "diversity_loss_mlp": 0.0, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04894531982576629, + "language_loss": 0.82796603, + "learning_rate": 0.0005137061445144395, + "loss": 0.8386386, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2633, + "time_per_iteration": 2.8800737857818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076133, + "balance_loss_mlp": 1.06476033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06583044180155191, + "language_loss": 0.87074906, + "learning_rate": 0.000513394716637712, + "loss": 0.88151038, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2634, + "time_per_iteration": 2.7507505416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_mlp": 1.02921486, + "diversity_loss_mlp": 0.0, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03533282921310782, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80227697, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2635, + "time_per_iteration": 4.825605869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110028, + "balance_loss_mlp": 1.08881176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.07735545811428028, + "language_loss": 0.81068468, + "learning_rate": 0.0005127718454042958, + "loss": 0.82168746, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2636, + "time_per_iteration": 2.8241050243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_mlp": 1.08840299, + "diversity_loss_mlp": 0.0, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.08187506034762644, + "language_loss": 0.83836603, + "learning_rate": 0.0005124604022894269, + "loss": 0.8493644, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2637, + "time_per_iteration": 2.9366774559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019034, + "balance_loss_mlp": 1.01259708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.025963071476552062, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7820726, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 2638, + "time_per_iteration": 4.828620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092892, + "balance_loss_mlp": 1.08166814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.07837351333742608, + "language_loss": 0.83244252, + "learning_rate": 0.0005118375016679325, + "loss": 0.84337139, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2639, + "time_per_iteration": 2.801852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077953, + "balance_loss_mlp": 1.0666697, + "diversity_loss_mlp": 0.0, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.07879033409242599, + "language_loss": 0.80358827, + "learning_rate": 0.0005115260444031382, + "loss": 0.81436777, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2640, + "time_per_iteration": 2.596771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010253, + "balance_loss_mlp": 1.00422084, + "diversity_loss_mlp": 0.0, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011737851482073082, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79742074, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.06030273, + "routerloss_mlp": 0.0, + "step": 2641, + "time_per_iteration": 4.948842287063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075403, + "balance_loss_mlp": 1.06412029, + "diversity_loss_mlp": 0.0, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.08031663653823312, + "language_loss": 0.8740893, + "learning_rate": 0.0005109031165700483, + "loss": 0.88484335, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2642, + "time_per_iteration": 2.5833895206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_mlp": 1.04938459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.06372027514248847, + "language_loss": 0.83170295, + "learning_rate": 0.0005105916462435945, + "loss": 0.84231174, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2643, + "time_per_iteration": 2.841296911239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.05014455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0681709540800111, + "language_loss": 0.85266602, + "learning_rate": 0.0005102801718050989, + "loss": 0.86328042, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2644, + "time_per_iteration": 2.680905818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058539, + "balance_loss_mlp": 1.04714894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.07434027721258654, + "language_loss": 0.89314902, + "learning_rate": 0.0005099686933754867, + "loss": 0.90373439, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2645, + "time_per_iteration": 2.723043441772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_mlp": 1.05088663, + "diversity_loss_mlp": 0.0, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07256046334666034, + "language_loss": 0.8429243, + "learning_rate": 0.0005096572110756845, + "loss": 0.85354686, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2646, + "time_per_iteration": 2.6682143211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069615, + "balance_loss_mlp": 1.05801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06200075514200526, + "language_loss": 0.85445803, + "learning_rate": 0.0005093457250266205, + "loss": 0.86515421, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2647, + "time_per_iteration": 2.682891368865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.05816472, + "diversity_loss_mlp": 0.0, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1092618136395953, + "language_loss": 0.83279526, + "learning_rate": 0.000509034235349224, + "loss": 0.84349322, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2648, + "time_per_iteration": 2.7173004150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068823, + "balance_loss_mlp": 1.05756938, + "diversity_loss_mlp": 0.0, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07759183255272654, + "language_loss": 0.81290972, + "learning_rate": 0.0005087227421644266, + "loss": 0.82359791, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2649, + "time_per_iteration": 2.79217791557312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.05469334, + "diversity_loss_mlp": 0.0, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.07036579944312285, + "language_loss": 0.85978615, + "learning_rate": 0.0005084112455931602, + "loss": 0.87045121, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2650, + "time_per_iteration": 2.593323230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.06125915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.06673546987966349, + "language_loss": 0.85377133, + "learning_rate": 0.0005080997457563586, + "loss": 0.86449993, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2651, + "time_per_iteration": 2.5473101139068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074592, + "balance_loss_mlp": 1.06324303, + "diversity_loss_mlp": 0.0, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.07839929831674766, + "language_loss": 0.79146206, + "learning_rate": 0.0005077882427749569, + "loss": 0.80220807, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2652, + "time_per_iteration": 2.5378577709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081444, + "balance_loss_mlp": 1.07002354, + "diversity_loss_mlp": 0.0, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09222135648623411, + "language_loss": 0.84599656, + "learning_rate": 0.0005074767367698913, + "loss": 0.85681099, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2653, + "time_per_iteration": 2.7541823387145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.0749042, + "diversity_loss_mlp": 0.0, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.07250262260433718, + "language_loss": 0.82987714, + "learning_rate": 0.0005071652278620988, + "loss": 0.84074312, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2654, + "time_per_iteration": 3.0615251064300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089781, + "balance_loss_mlp": 1.07870018, + "diversity_loss_mlp": 0.0, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.07582936293709001, + "language_loss": 0.83328903, + "learning_rate": 0.0005068537161725186, + "loss": 0.84418684, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2655, + "time_per_iteration": 2.7840993404388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092544, + "balance_loss_mlp": 1.08139753, + "diversity_loss_mlp": 0.0, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07786356346883126, + "language_loss": 0.84288549, + "learning_rate": 0.0005065422018220893, + "loss": 0.85381097, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.1114502, + "routerloss_mlp": 0.0, + "step": 2656, + "time_per_iteration": 2.832575798034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.09118247, + "diversity_loss_mlp": 0.0, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.08194812181942494, + "language_loss": 0.80392313, + "learning_rate": 0.0005062306849317521, + "loss": 0.81494415, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2657, + "time_per_iteration": 2.794966220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.08891487, + "diversity_loss_mlp": 0.0, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.08210850574888065, + "language_loss": 0.83486134, + "learning_rate": 0.0005059191656224487, + "loss": 0.84586298, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2658, + "time_per_iteration": 2.744889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093286, + "balance_loss_mlp": 1.08238411, + "diversity_loss_mlp": 0.0, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.07321009008554179, + "language_loss": 0.88860798, + "learning_rate": 0.0005056076440151212, + "loss": 0.89954078, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2659, + "time_per_iteration": 2.6951825618743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113007, + "balance_loss_mlp": 1.12453902, + "diversity_loss_mlp": 0.0, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.07076104465295206, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77418184, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2660, + "time_per_iteration": 4.850585460662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.07051301, + "diversity_loss_mlp": 0.0, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06225287802871053, + "language_loss": 0.86966121, + "learning_rate": 0.0005049845943901691, + "loss": 0.88047487, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2661, + "time_per_iteration": 2.8342370986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079888, + "balance_loss_mlp": 1.0692786, + "diversity_loss_mlp": 0.0, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.058043198592839004, + "language_loss": 0.86637139, + "learning_rate": 0.0005046730666144338, + "loss": 0.87717032, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2662, + "time_per_iteration": 2.8066177368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078601, + "balance_loss_mlp": 1.06801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.058701328600128284, + "language_loss": 0.87834954, + "learning_rate": 0.0005043615370244532, + "loss": 0.88913548, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2663, + "time_per_iteration": 3.3716113567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_mlp": 1.04589903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02890820887526385, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79295814, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2664, + "time_per_iteration": 4.632098913192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074303, + "balance_loss_mlp": 1.0636878, + "diversity_loss_mlp": 0.0, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.05776678043634197, + "language_loss": 0.85301316, + "learning_rate": 0.0005037384728855425, + "loss": 0.86375624, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2665, + "time_per_iteration": 2.8025074005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077204, + "balance_loss_mlp": 1.06618285, + "diversity_loss_mlp": 0.0, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08001364709617295, + "language_loss": 0.84092522, + "learning_rate": 0.0005034269385785075, + "loss": 0.85169727, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2666, + "time_per_iteration": 2.6508989334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070249, + "balance_loss_mlp": 1.05929327, + "diversity_loss_mlp": 0.0, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.06550806602425656, + "language_loss": 0.849998, + "learning_rate": 0.0005031154029410168, + "loss": 0.86070049, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2667, + "time_per_iteration": 2.6072959899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062599, + "balance_loss_mlp": 1.05130351, + "diversity_loss_mlp": 0.0, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07261202613887993, + "language_loss": 0.86903906, + "learning_rate": 0.0005028038660940197, + "loss": 0.87966514, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2668, + "time_per_iteration": 2.5607664585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_mlp": 1.04923522, + "diversity_loss_mlp": 0.0, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.06521290367629204, + "language_loss": 0.84553415, + "learning_rate": 0.0005024923281584648, + "loss": 0.8561402, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2669, + "time_per_iteration": 2.623643159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066692, + "balance_loss_mlp": 1.05528402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.06549707374857121, + "language_loss": 0.82560658, + "learning_rate": 0.0005021807892553026, + "loss": 0.83627355, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2670, + "time_per_iteration": 2.699392318725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062757, + "balance_loss_mlp": 1.05140269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07318428846825417, + "language_loss": 0.84862608, + "learning_rate": 0.0005018692495054828, + "loss": 0.85925364, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2671, + "time_per_iteration": 2.7645046710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106912, + "balance_loss_mlp": 1.05812323, + "diversity_loss_mlp": 0.0, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06397327244364565, + "language_loss": 0.80696338, + "learning_rate": 0.0005015577090299561, + "loss": 0.81765461, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.11004639, + "routerloss_mlp": 0.0, + "step": 2672, + "time_per_iteration": 2.684048891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068328, + "balance_loss_mlp": 1.05731261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.06574977800170037, + "language_loss": 0.86744952, + "learning_rate": 0.0005012461679496729, + "loss": 0.87813282, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2673, + "time_per_iteration": 2.5885825157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.06613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.09032594792095527, + "language_loss": 0.87748468, + "learning_rate": 0.0005009346263855848, + "loss": 0.88825834, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2674, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092275, + "balance_loss_mlp": 1.08141518, + "diversity_loss_mlp": 0.0, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.06465969942237398, + "language_loss": 0.83699256, + "learning_rate": 0.0005006230844586422, + "loss": 0.84791529, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2675, + "time_per_iteration": 2.7912445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00882234, + "balance_loss_mlp": 1.52600026, + "diversity_loss_mlp": 0.21199086, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.0263651655655577, + "language_loss": 0.78895926, + "learning_rate": 0.0005003115422897968, + "loss": 0.79778159, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01323896, + "step": 2676, + "time_per_iteration": 2.8051552772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111408, + "balance_loss_mlp": 1.10282683, + "diversity_loss_mlp": 0.0, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.0741463219638638, + "language_loss": 0.87253916, + "learning_rate": 0.0005, + "loss": 0.88367999, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2677, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119404, + "balance_loss_mlp": 1.10841274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.08792863943872284, + "language_loss": 0.79283178, + "learning_rate": 0.0004996884577102033, + "loss": 0.80402583, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.10992432, + "routerloss_mlp": 0.0, + "step": 2678, + "time_per_iteration": 3.089707374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111646, + "balance_loss_mlp": 1.10545659, + "diversity_loss_mlp": 0.0, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.08112886088857633, + "language_loss": 0.84611261, + "learning_rate": 0.000499376915541358, + "loss": 0.85727721, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 2679, + "time_per_iteration": 2.7143540382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_mlp": 1.08910465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.16255458440641746, + "language_loss": 0.81113428, + "learning_rate": 0.0004990653736144155, + "loss": 0.82213122, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2680, + "time_per_iteration": 2.857952356338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_mlp": 1.07416916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06912387000686389, + "language_loss": 0.85820174, + "learning_rate": 0.0004987538320503271, + "loss": 0.86905092, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.10748291, + "routerloss_mlp": 0.0, + "step": 2681, + "time_per_iteration": 2.485462188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077984, + "balance_loss_mlp": 1.06715369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08121908376237164, + "language_loss": 0.83137929, + "learning_rate": 0.0004984422909700442, + "loss": 0.84215909, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2682, + "time_per_iteration": 2.7179505825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_mlp": 1.05784559, + "diversity_loss_mlp": 0.0, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.07829442771548371, + "language_loss": 0.83800036, + "learning_rate": 0.0004981307504945173, + "loss": 0.84868753, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2683, + "time_per_iteration": 2.71893048286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061815, + "balance_loss_mlp": 1.05075228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.08619577510477876, + "language_loss": 0.89448887, + "learning_rate": 0.0004978192107446976, + "loss": 0.90510702, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2684, + "time_per_iteration": 2.7385506629943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062179, + "balance_loss_mlp": 1.05111599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.08129158019501125, + "language_loss": 0.8740204, + "learning_rate": 0.0004975076718415353, + "loss": 0.88464212, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2685, + "time_per_iteration": 2.599379777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_mlp": 1.04478931, + "diversity_loss_mlp": 0.0, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.06772474949474022, + "language_loss": 0.90610582, + "learning_rate": 0.0004971961339059806, + "loss": 0.91666389, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.11016846, + "routerloss_mlp": 0.0, + "step": 2686, + "time_per_iteration": 2.498819589614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057473, + "balance_loss_mlp": 1.04611838, + "diversity_loss_mlp": 0.0, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06487308694775892, + "language_loss": 0.84021914, + "learning_rate": 0.0004968845970589832, + "loss": 0.85079384, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2687, + "time_per_iteration": 2.6814825534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.04982185, + "diversity_loss_mlp": 0.0, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.06911328459433905, + "language_loss": 0.8435297, + "learning_rate": 0.0004965730614214926, + "loss": 0.8541429, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2688, + "time_per_iteration": 2.6537294387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106762, + "balance_loss_mlp": 1.05618167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07039148040030412, + "language_loss": 0.85285878, + "learning_rate": 0.0004962615271144576, + "loss": 0.86353499, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2689, + "time_per_iteration": 2.50710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.05325246, + "diversity_loss_mlp": 0.0, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.0770213433091723, + "language_loss": 0.82680881, + "learning_rate": 0.0004959499942588264, + "loss": 0.83745599, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2690, + "time_per_iteration": 2.892293930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_mlp": 1.04297149, + "diversity_loss_mlp": 0.0, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03551055813206397, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79249913, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2691, + "time_per_iteration": 4.764665842056274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.05894208, + "diversity_loss_mlp": 0.0, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.08037192658361764, + "language_loss": 0.85416174, + "learning_rate": 0.0004953269333855661, + "loss": 0.86486399, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2692, + "time_per_iteration": 2.785511016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06407034, + "diversity_loss_mlp": 0.0, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.06114385406953633, + "language_loss": 0.84516799, + "learning_rate": 0.0004950154056098309, + "loss": 0.85592318, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.11437988, + "routerloss_mlp": 0.0, + "step": 2693, + "time_per_iteration": 2.683246374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.07183599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.08066804074186672, + "language_loss": 0.84078431, + "learning_rate": 0.0004947038797692867, + "loss": 0.85161769, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2694, + "time_per_iteration": 2.8312196731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872465, + "balance_loss_mlp": 1.50766385, + "diversity_loss_mlp": 0.2097543, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.031552182630998016, + "language_loss": 0.77636528, + "learning_rate": 0.0004943923559848789, + "loss": 0.78508997, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01375636, + "step": 2695, + "time_per_iteration": 2.8084189891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010865, + "balance_loss_mlp": 1.07534158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.055486891719670514, + "language_loss": 0.90695632, + "learning_rate": 0.0004940808343775515, + "loss": 0.91782129, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2696, + "time_per_iteration": 2.6868011951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874209, + "balance_loss_mlp": 1.50797677, + "diversity_loss_mlp": 0.21290711, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.034010170020107075, + "language_loss": 0.82213199, + "learning_rate": 0.0004937693150682479, + "loss": 0.83087409, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01376703, + "step": 2697, + "time_per_iteration": 2.5905513763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090314, + "balance_loss_mlp": 1.07915568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.06705206433038317, + "language_loss": 0.7658723, + "learning_rate": 0.0004934577981779107, + "loss": 0.77677542, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2698, + "time_per_iteration": 2.7049057483673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.07585335, + "diversity_loss_mlp": 0.0, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.061529133753451364, + "language_loss": 0.812904, + "learning_rate": 0.0004931462838274817, + "loss": 0.82377493, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2699, + "time_per_iteration": 2.8723175525665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089813, + "balance_loss_mlp": 1.07877994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.08487292742433496, + "language_loss": 0.84222901, + "learning_rate": 0.0004928347721379011, + "loss": 0.85312712, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2700, + "time_per_iteration": 2.639867067337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080545, + "balance_loss_mlp": 1.06974459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06134037245316137, + "language_loss": 0.82221866, + "learning_rate": 0.0004925232632301089, + "loss": 0.83302414, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2701, + "time_per_iteration": 2.622311592102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077123, + "balance_loss_mlp": 1.0660243, + "diversity_loss_mlp": 0.0, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.06337758152829237, + "language_loss": 0.79842103, + "learning_rate": 0.0004922117572250431, + "loss": 0.80919224, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 2702, + "time_per_iteration": 2.6980605125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070723, + "balance_loss_mlp": 1.05936241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.07398400160993446, + "language_loss": 0.80852163, + "learning_rate": 0.0004919002542436414, + "loss": 0.81922889, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2703, + "time_per_iteration": 2.8354647159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072853, + "balance_loss_mlp": 1.0619514, + "diversity_loss_mlp": 0.0, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.064542502306726, + "language_loss": 0.8126899, + "learning_rate": 0.0004915887544068399, + "loss": 0.8234185, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2704, + "time_per_iteration": 2.6693973541259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.05770195, + "diversity_loss_mlp": 0.0, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.06578360362401801, + "language_loss": 0.7856639, + "learning_rate": 0.0004912772578355736, + "loss": 0.79635167, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2705, + "time_per_iteration": 2.892735481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0611918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.07750798967783011, + "language_loss": 0.82549465, + "learning_rate": 0.000490965764650776, + "loss": 0.83621788, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2706, + "time_per_iteration": 2.8544106483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_mlp": 1.05984521, + "diversity_loss_mlp": 0.0, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06572065456776559, + "language_loss": 0.82828736, + "learning_rate": 0.0004906542749733798, + "loss": 0.83899713, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.11132812, + "routerloss_mlp": 0.0, + "step": 2707, + "time_per_iteration": 3.6044294834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.06353068, + "diversity_loss_mlp": 0.0, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.055629683487612144, + "language_loss": 0.85401118, + "learning_rate": 0.0004903427889243156, + "loss": 0.86475539, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.10894775, + "routerloss_mlp": 0.0, + "step": 2708, + "time_per_iteration": 2.830115795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075334, + "balance_loss_mlp": 1.06425905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.06692681375903406, + "language_loss": 0.85444081, + "learning_rate": 0.0004900313066245134, + "loss": 0.86519414, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2709, + "time_per_iteration": 2.6552441120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_mlp": 1.05745232, + "diversity_loss_mlp": 0.0, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.06855502771674758, + "language_loss": 0.81061214, + "learning_rate": 0.0004897198281949012, + "loss": 0.82129598, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2710, + "time_per_iteration": 2.645981550216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874972, + "balance_loss_mlp": 1.51124442, + "diversity_loss_mlp": 0.21021394, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.03577466895356274, + "language_loss": 0.78009295, + "learning_rate": 0.0004894083537564057, + "loss": 0.78884268, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424256, + "step": 2711, + "time_per_iteration": 2.746945858001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086804, + "balance_loss_mlp": 1.49602354, + "diversity_loss_mlp": 0.21089339, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.02967241377466632, + "language_loss": 0.80981171, + "learning_rate": 0.0004890968834299519, + "loss": 0.81849211, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01458106, + "step": 2712, + "time_per_iteration": 2.749049663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072348, + "balance_loss_mlp": 1.06096959, + "diversity_loss_mlp": 0.0, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06422523073894505, + "language_loss": 0.78739542, + "learning_rate": 0.0004887854173364633, + "loss": 0.79811883, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2713, + "time_per_iteration": 2.760077953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00862336, + "balance_loss_mlp": 1.48416615, + "diversity_loss_mlp": 0.2112534, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.02839704110509781, + "language_loss": 0.81564224, + "learning_rate": 0.0004884739555968617, + "loss": 0.8242656, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01462588, + "step": 2714, + "time_per_iteration": 2.902200698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_mlp": 1.03711605, + "diversity_loss_mlp": 0.0, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.025188943281148922, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8002032, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2715, + "time_per_iteration": 4.977273464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847492, + "balance_loss_mlp": 1.45660305, + "diversity_loss_mlp": 0.21012819, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.03573397478438407, + "language_loss": 0.86888605, + "learning_rate": 0.0004878510456629992, + "loss": 0.87736094, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01412619, + "step": 2716, + "time_per_iteration": 2.998455286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068588, + "balance_loss_mlp": 1.05767989, + "diversity_loss_mlp": 0.0, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.06765059094142209, + "language_loss": 0.85142076, + "learning_rate": 0.00048753959771057314, + "loss": 0.86210662, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.10925293, + "routerloss_mlp": 0.0, + "step": 2717, + "time_per_iteration": 2.6113662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065726, + "balance_loss_mlp": 1.05442464, + "diversity_loss_mlp": 0.0, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.08600503840688169, + "language_loss": 0.82445514, + "learning_rate": 0.0004872281545957044, + "loss": 0.83511233, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2718, + "time_per_iteration": 2.7617604732513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05911732, + "diversity_loss_mlp": 0.0, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.061040572409093316, + "language_loss": 0.86051857, + "learning_rate": 0.0004869167164393055, + "loss": 0.87122279, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.11303711, + "routerloss_mlp": 0.0, + "step": 2719, + "time_per_iteration": 2.932154417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.05857992, + "diversity_loss_mlp": 0.0, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.11614833297327579, + "language_loss": 0.89542395, + "learning_rate": 0.00048660528336228793, + "loss": 0.90612125, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.11151123, + "routerloss_mlp": 0.0, + "step": 2720, + "time_per_iteration": 2.7917380332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071611, + "balance_loss_mlp": 1.06013143, + "diversity_loss_mlp": 0.0, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05730438157509479, + "language_loss": 0.90177751, + "learning_rate": 0.0004862938554855606, + "loss": 0.91249359, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2721, + "time_per_iteration": 2.809875965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074661, + "balance_loss_mlp": 1.06371188, + "diversity_loss_mlp": 0.0, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.06740042101514945, + "language_loss": 0.86071771, + "learning_rate": 0.0004859824329300304, + "loss": 0.87146431, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2722, + "time_per_iteration": 2.5660176277160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070887, + "balance_loss_mlp": 1.05932951, + "diversity_loss_mlp": 0.0, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.06312939516717878, + "language_loss": 0.83826602, + "learning_rate": 0.00048567101581660244, + "loss": 0.84897488, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2723, + "time_per_iteration": 2.593005895614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107168, + "balance_loss_mlp": 1.0603317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.07171512526566694, + "language_loss": 0.86622667, + "learning_rate": 0.00048535960426617956, + "loss": 0.87694347, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2724, + "time_per_iteration": 2.611551523208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070313, + "balance_loss_mlp": 1.05852962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.07077799246948024, + "language_loss": 0.81735158, + "learning_rate": 0.0004850481983996621, + "loss": 0.82805473, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2725, + "time_per_iteration": 2.7656939029693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058325, + "balance_loss_mlp": 1.04673731, + "diversity_loss_mlp": 0.0, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.07497614956550303, + "language_loss": 0.87961793, + "learning_rate": 0.0004847367983379492, + "loss": 0.89020109, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2726, + "time_per_iteration": 2.523099899291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.05477571, + "diversity_loss_mlp": 0.0, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06275633211650163, + "language_loss": 0.78715622, + "learning_rate": 0.00048442540420193643, + "loss": 0.79781681, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2727, + "time_per_iteration": 2.9433038234710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056804, + "balance_loss_mlp": 1.04506755, + "diversity_loss_mlp": 0.0, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07393634521455344, + "language_loss": 0.79367208, + "learning_rate": 0.0004841140161125182, + "loss": 0.80424011, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2728, + "time_per_iteration": 3.619252920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.05171847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.07165329358033216, + "language_loss": 0.84827459, + "learning_rate": 0.0004838026341905857, + "loss": 0.85890496, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.11322021, + "routerloss_mlp": 0.0, + "step": 2729, + "time_per_iteration": 2.716114044189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057536, + "balance_loss_mlp": 1.04594862, + "diversity_loss_mlp": 0.0, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.13042739485624238, + "language_loss": 0.85312545, + "learning_rate": 0.00048349125855702844, + "loss": 0.86370087, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2730, + "time_per_iteration": 2.787280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837258, + "balance_loss_mlp": 1.43598437, + "diversity_loss_mlp": 0.21135046, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.027658523195400363, + "language_loss": 0.81318069, + "learning_rate": 0.00048317988933273287, + "loss": 0.82155323, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01359018, + "step": 2731, + "time_per_iteration": 2.763814687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057346, + "balance_loss_mlp": 1.04585993, + "diversity_loss_mlp": 0.0, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.07420390441928848, + "language_loss": 0.82373381, + "learning_rate": 0.00048286852663858367, + "loss": 0.83430725, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2732, + "time_per_iteration": 2.9533157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.05203819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.07616653501098058, + "language_loss": 0.8428973, + "learning_rate": 0.000482557170595462, + "loss": 0.8535338, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2733, + "time_per_iteration": 2.865147829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.0532366, + "diversity_loss_mlp": 0.0, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.060395165010054055, + "language_loss": 0.87880594, + "learning_rate": 0.0004822458213242475, + "loss": 0.88945693, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.11859131, + "routerloss_mlp": 0.0, + "step": 2734, + "time_per_iteration": 2.557253360748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070633, + "balance_loss_mlp": 1.05886698, + "diversity_loss_mlp": 0.0, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.1031910380133139, + "language_loss": 0.86086309, + "learning_rate": 0.00048193447894581627, + "loss": 0.8715694, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2735, + "time_per_iteration": 3.122976541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_mlp": 1.06436014, + "diversity_loss_mlp": 0.0, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06843040001694842, + "language_loss": 0.8809998, + "learning_rate": 0.00048162314358104243, + "loss": 0.89175981, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2736, + "time_per_iteration": 2.6340246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00824973, + "balance_loss_mlp": 1.41347969, + "diversity_loss_mlp": 0.20989257, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.031515925317837694, + "language_loss": 0.83306372, + "learning_rate": 0.0004813118153507969, + "loss": 0.84131336, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01328672, + "step": 2737, + "time_per_iteration": 2.7356157302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_mlp": 1.03480983, + "diversity_loss_mlp": 0.0, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03217065957479051, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83488321, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2738, + "time_per_iteration": 4.772867202758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.06062317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.0555866415390632, + "language_loss": 0.83715498, + "learning_rate": 0.00048068918077736163, + "loss": 0.84787494, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2739, + "time_per_iteration": 3.2028074264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076914, + "balance_loss_mlp": 1.06573176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06998122113459494, + "language_loss": 0.81445146, + "learning_rate": 0.0004803778746759001, + "loss": 0.82522058, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2740, + "time_per_iteration": 2.87070369720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082959, + "balance_loss_mlp": 1.07215285, + "diversity_loss_mlp": 0.0, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.07737040857299185, + "language_loss": 0.82122779, + "learning_rate": 0.00048006657619242317, + "loss": 0.83205736, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.10809326, + "routerloss_mlp": 0.0, + "step": 2741, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107519, + "balance_loss_mlp": 1.06447887, + "diversity_loss_mlp": 0.0, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07879516603511716, + "language_loss": 0.78380877, + "learning_rate": 0.00047975528544778775, + "loss": 0.79456067, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2742, + "time_per_iteration": 2.6197235584259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079206, + "balance_loss_mlp": 1.06839335, + "diversity_loss_mlp": 0.0, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07439948679259917, + "language_loss": 0.88816094, + "learning_rate": 0.00047944400256284754, + "loss": 0.89895302, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.10827637, + "routerloss_mlp": 0.0, + "step": 2743, + "time_per_iteration": 2.6887855529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00830459, + "balance_loss_mlp": 1.42072511, + "diversity_loss_mlp": 0.21262056, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.03227823662204125, + "language_loss": 0.799101, + "learning_rate": 0.0004791327276584532, + "loss": 0.80740565, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01378582, + "step": 2744, + "time_per_iteration": 2.8497848510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07629538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.0718535906247093, + "language_loss": 0.80497956, + "learning_rate": 0.00047882146085545264, + "loss": 0.81585032, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2745, + "time_per_iteration": 2.6078941822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017458, + "balance_loss_mlp": 1.01199865, + "diversity_loss_mlp": 0.0, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.013176381696238814, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76419842, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 2746, + "time_per_iteration": 4.974900007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078307, + "balance_loss_mlp": 1.06777453, + "diversity_loss_mlp": 0.0, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0894490118638191, + "language_loss": 0.79344547, + "learning_rate": 0.00047819895203700684, + "loss": 0.80422854, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2747, + "time_per_iteration": 2.717135190963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_mlp": 1.00983751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.009473538771460566, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76527709, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 2748, + "time_per_iteration": 4.642770290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085947, + "balance_loss_mlp": 1.07577801, + "diversity_loss_mlp": 0.0, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.07060951554594143, + "language_loss": 0.88469762, + "learning_rate": 0.0004775764770742277, + "loss": 0.89555711, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2749, + "time_per_iteration": 2.8018476963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087955, + "balance_loss_mlp": 1.07761312, + "diversity_loss_mlp": 0.0, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.08234082280170717, + "language_loss": 0.86406553, + "learning_rate": 0.00047726525259079777, + "loss": 0.8749451, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2750, + "time_per_iteration": 2.8415229320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00831428, + "balance_loss_mlp": 1.42309499, + "diversity_loss_mlp": 0.21321589, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.03400797212131273, + "language_loss": 0.88723552, + "learning_rate": 0.0004769540369337798, + "loss": 0.89554983, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01327293, + "step": 2751, + "time_per_iteration": 2.752032518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100532, + "balance_loss_mlp": 1.09000587, + "diversity_loss_mlp": 0.0, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06288245154731438, + "language_loss": 0.85769415, + "learning_rate": 0.00047664283022399794, + "loss": 0.86869949, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2752, + "time_per_iteration": 2.8568003177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107464, + "balance_loss_mlp": 1.09725976, + "diversity_loss_mlp": 0.0, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0883883166781065, + "language_loss": 0.80924225, + "learning_rate": 0.00047633163258227376, + "loss": 0.82031691, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2753, + "time_per_iteration": 2.8275938034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104119, + "balance_loss_mlp": 1.09359312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06733658380062774, + "language_loss": 0.85417688, + "learning_rate": 0.0004760204441294247, + "loss": 0.86521804, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2754, + "time_per_iteration": 2.6338090896606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104137, + "balance_loss_mlp": 1.09376574, + "diversity_loss_mlp": 0.0, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06936353635633287, + "language_loss": 0.85999346, + "learning_rate": 0.00047570926498626486, + "loss": 0.87103486, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 2755, + "time_per_iteration": 2.716575860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108637, + "balance_loss_mlp": 1.09822416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.061285448286525046, + "language_loss": 0.81361842, + "learning_rate": 0.00047539809527360474, + "loss": 0.82470477, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2756, + "time_per_iteration": 2.881225109100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.0919373, + "diversity_loss_mlp": 0.0, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05865021558391441, + "language_loss": 0.82642096, + "learning_rate": 0.0004750869351122511, + "loss": 0.83744538, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2757, + "time_per_iteration": 2.9978790283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096362, + "balance_loss_mlp": 1.08600891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.07787390265260127, + "language_loss": 0.81663013, + "learning_rate": 0.00047477578462300685, + "loss": 0.82759368, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2758, + "time_per_iteration": 2.700833797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090285, + "balance_loss_mlp": 1.07975245, + "diversity_loss_mlp": 0.0, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.069319292192906, + "language_loss": 0.80022508, + "learning_rate": 0.0004744646439266718, + "loss": 0.81112796, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2759, + "time_per_iteration": 3.0144033432006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084993, + "balance_loss_mlp": 1.07477677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.05678736813253772, + "language_loss": 0.92058611, + "learning_rate": 0.000474153513144041, + "loss": 0.93143606, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2760, + "time_per_iteration": 2.890305995941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082633, + "balance_loss_mlp": 1.07224369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06975892982263965, + "language_loss": 0.8659752, + "learning_rate": 0.00047384239239590633, + "loss": 0.87680155, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2761, + "time_per_iteration": 2.864649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076622, + "balance_loss_mlp": 1.06607819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06592907525694008, + "language_loss": 0.88956439, + "learning_rate": 0.0004735312818030556, + "loss": 0.90033066, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2762, + "time_per_iteration": 2.7256298065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.06967998, + "diversity_loss_mlp": 0.0, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06903030148880929, + "language_loss": 0.82737643, + "learning_rate": 0.0004732201814862727, + "loss": 0.83817625, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2763, + "time_per_iteration": 2.785104990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078579, + "balance_loss_mlp": 1.0687145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.07391416357546753, + "language_loss": 0.81619537, + "learning_rate": 0.0004729090915663373, + "loss": 0.82698119, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 2764, + "time_per_iteration": 2.841716766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841129, + "balance_loss_mlp": 1.43825924, + "diversity_loss_mlp": 0.21717778, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.03676047653681057, + "language_loss": 0.84753668, + "learning_rate": 0.00047259801216402534, + "loss": 0.85594797, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01341068, + "step": 2765, + "time_per_iteration": 2.5414865016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078812, + "balance_loss_mlp": 1.06872129, + "diversity_loss_mlp": 0.0, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.08353685320939014, + "language_loss": 0.86307138, + "learning_rate": 0.00047228694340010845, + "loss": 0.87385947, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 2766, + "time_per_iteration": 2.571230173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083419, + "balance_loss_mlp": 1.07304192, + "diversity_loss_mlp": 0.0, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07758433064211989, + "language_loss": 0.85983396, + "learning_rate": 0.0004719758853953544, + "loss": 0.87066811, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2767, + "time_per_iteration": 3.5577545166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.07479465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.08923013324738549, + "language_loss": 0.83480549, + "learning_rate": 0.00047166483827052645, + "loss": 0.84565854, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2768, + "time_per_iteration": 2.3904964923858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014357, + "balance_loss_mlp": 1.0088253, + "diversity_loss_mlp": 0.0, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.015852342000118255, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78092843, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2769, + "time_per_iteration": 4.993681907653809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100393, + "balance_loss_mlp": 1.08974218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.07499519146645399, + "language_loss": 0.8344022, + "learning_rate": 0.000471042777143682, + "loss": 0.84540612, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2770, + "time_per_iteration": 3.2187654972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099844, + "balance_loss_mlp": 1.0895741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.07177386868704265, + "language_loss": 0.79602164, + "learning_rate": 0.0004707317633831707, + "loss": 0.80702007, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2771, + "time_per_iteration": 2.5579092502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08694136, + "diversity_loss_mlp": 0.0, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.08358365289860634, + "language_loss": 0.78326285, + "learning_rate": 0.00047042076098559673, + "loss": 0.79423904, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.10687256, + "routerloss_mlp": 0.0, + "step": 2772, + "time_per_iteration": 2.6240808963775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089428, + "balance_loss_mlp": 1.07924104, + "diversity_loss_mlp": 0.0, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07827879900232339, + "language_loss": 0.7374208, + "learning_rate": 0.00047010977007170174, + "loss": 0.7483151, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2773, + "time_per_iteration": 3.239807605743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108646, + "balance_loss_mlp": 1.07606506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.0770996892807777, + "language_loss": 0.82462615, + "learning_rate": 0.00046979879076222334, + "loss": 0.83549076, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2774, + "time_per_iteration": 2.6871917247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081473, + "balance_loss_mlp": 1.07122087, + "diversity_loss_mlp": 0.0, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.060681013844514214, + "language_loss": 0.84932172, + "learning_rate": 0.0004694878231778939, + "loss": 0.86013645, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2775, + "time_per_iteration": 3.3516969680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_mlp": 1.07336903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06561156947814625, + "language_loss": 0.84353071, + "learning_rate": 0.0004691768674394423, + "loss": 0.85436922, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2776, + "time_per_iteration": 2.9356815814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010203, + "balance_loss_mlp": 1.01491189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.017317997453326725, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85504305, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2777, + "time_per_iteration": 4.766932010650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017275, + "balance_loss_mlp": 1.01186275, + "diversity_loss_mlp": 0.0, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.016201867017030143, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77670807, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 2778, + "time_per_iteration": 5.022111177444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081648, + "balance_loss_mlp": 1.07109189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.08348606714079294, + "language_loss": 0.79229748, + "learning_rate": 0.00046824407250656676, + "loss": 0.803114, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2779, + "time_per_iteration": 2.6202685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079296, + "balance_loss_mlp": 1.06859064, + "diversity_loss_mlp": 0.0, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.0812040646365834, + "language_loss": 0.83481312, + "learning_rate": 0.0004679331653588161, + "loss": 0.84560603, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2780, + "time_per_iteration": 2.6287879943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.07337165, + "diversity_loss_mlp": 0.0, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.08148878126655458, + "language_loss": 0.85570091, + "learning_rate": 0.0004676222706605147, + "loss": 0.86654037, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2781, + "time_per_iteration": 2.634186029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082358, + "balance_loss_mlp": 1.07175457, + "diversity_loss_mlp": 0.0, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08561637601090062, + "language_loss": 0.84885913, + "learning_rate": 0.0004673113885323626, + "loss": 0.85968268, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.10601807, + "routerloss_mlp": 0.0, + "step": 2782, + "time_per_iteration": 2.839108943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.07358241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.0730092425976976, + "language_loss": 0.78793383, + "learning_rate": 0.00046700051909505494, + "loss": 0.79877448, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.10479736, + "routerloss_mlp": 0.0, + "step": 2783, + "time_per_iteration": 3.1548988819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080824, + "balance_loss_mlp": 1.06943369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678731146909953, + "language_loss": 0.84066731, + "learning_rate": 0.000466689662469282, + "loss": 0.85147554, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2784, + "time_per_iteration": 2.6213507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082867, + "balance_loss_mlp": 1.07235312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06931446022689573, + "language_loss": 0.83996934, + "learning_rate": 0.00046637881877572917, + "loss": 0.85079801, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.10522461, + "routerloss_mlp": 0.0, + "step": 2785, + "time_per_iteration": 3.1161208152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_mlp": 1.07350779, + "diversity_loss_mlp": 0.0, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.05978198327100757, + "language_loss": 0.84824258, + "learning_rate": 0.0004660679881350764, + "loss": 0.85908508, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 2786, + "time_per_iteration": 2.7317774295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.0375849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.025126940202686972, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.7665174, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.06005859, + "routerloss_mlp": 0.0, + "step": 2787, + "time_per_iteration": 5.0151801109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079952, + "balance_loss_mlp": 1.06945598, + "diversity_loss_mlp": 0.0, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07181749108152896, + "language_loss": 0.78038859, + "learning_rate": 0.0004654463664951667, + "loss": 0.79118812, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2788, + "time_per_iteration": 2.9862492084503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.06444538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06160548649513732, + "language_loss": 0.83008492, + "learning_rate": 0.0004651355757372447, + "loss": 0.84083349, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2789, + "time_per_iteration": 2.6209347248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00838367, + "balance_loss_mlp": 1.43426061, + "diversity_loss_mlp": 0.2158158, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.029696530744324656, + "language_loss": 0.8589375, + "learning_rate": 0.00046482479851489274, + "loss": 0.86732113, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01332852, + "step": 2790, + "time_per_iteration": 2.6991934776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.06660962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09378702232215988, + "language_loss": 0.77937293, + "learning_rate": 0.00046451403494876525, + "loss": 0.79014528, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.10632324, + "routerloss_mlp": 0.0, + "step": 2791, + "time_per_iteration": 2.8735973834991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05943799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.07434319158841775, + "language_loss": 0.84554839, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625106, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.1083374, + "routerloss_mlp": 0.0, + "step": 2792, + "time_per_iteration": 2.7458536624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065699, + "balance_loss_mlp": 1.05472004, + "diversity_loss_mlp": 0.0, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.06545464420604186, + "language_loss": 0.85163087, + "learning_rate": 0.00046389254926777404, + "loss": 0.86228788, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.10980225, + "routerloss_mlp": 0.0, + "step": 2793, + "time_per_iteration": 2.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062925, + "balance_loss_mlp": 1.0519762, + "diversity_loss_mlp": 0.0, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.06502650627416932, + "language_loss": 0.78292251, + "learning_rate": 0.0004635818273941926, + "loss": 0.79355174, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2794, + "time_per_iteration": 3.569359302520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.04798412, + "diversity_loss_mlp": 0.0, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.0851115940139546, + "language_loss": 0.81696212, + "learning_rate": 0.0004632711196593997, + "loss": 0.82755053, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2795, + "time_per_iteration": 2.763248920440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059516, + "balance_loss_mlp": 1.04872167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08577601840657965, + "language_loss": 0.85307401, + "learning_rate": 0.00046296042618402297, + "loss": 0.86366916, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2796, + "time_per_iteration": 3.059995651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.05436158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.05816929772054262, + "language_loss": 0.79285312, + "learning_rate": 0.0004626497470886839, + "loss": 0.80350512, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2797, + "time_per_iteration": 2.9551138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.04897988, + "diversity_loss_mlp": 0.0, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06686475877008137, + "language_loss": 0.82082057, + "learning_rate": 0.00046233908249399897, + "loss": 0.83141726, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 2798, + "time_per_iteration": 2.7494163513183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_mlp": 1.06012726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.06311972638358435, + "language_loss": 0.78919041, + "learning_rate": 0.00046202843252057905, + "loss": 0.79990107, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.10943604, + "routerloss_mlp": 0.0, + "step": 2799, + "time_per_iteration": 2.586824655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.06545627, + "diversity_loss_mlp": 0.0, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06763496495115903, + "language_loss": 0.83705521, + "learning_rate": 0.00046171779728902896, + "loss": 0.84781897, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2800, + "time_per_iteration": 2.5922951698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.07354665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.12725923305511472, + "language_loss": 0.86135888, + "learning_rate": 0.000461407176919948, + "loss": 0.87220615, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2801, + "time_per_iteration": 2.532080888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.07459974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08372818850883645, + "language_loss": 0.85317719, + "learning_rate": 0.00046109657153392997, + "loss": 0.8640309, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2802, + "time_per_iteration": 2.7498726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.07185912, + "diversity_loss_mlp": 0.0, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07972844989907181, + "language_loss": 0.82981819, + "learning_rate": 0.0004607859812515622, + "loss": 0.84064734, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2803, + "time_per_iteration": 2.5823397636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077838, + "balance_loss_mlp": 1.06679916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06982591680837838, + "language_loss": 0.88185596, + "learning_rate": 0.00046047540619342667, + "loss": 0.89263427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2804, + "time_per_iteration": 2.582594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089122, + "balance_loss_mlp": 1.07845902, + "diversity_loss_mlp": 0.0, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.06923180186476277, + "language_loss": 0.80359995, + "learning_rate": 0.00046016484648009933, + "loss": 0.81449121, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.10675049, + "routerloss_mlp": 0.0, + "step": 2805, + "time_per_iteration": 2.705085277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082055, + "balance_loss_mlp": 1.0713259, + "diversity_loss_mlp": 0.0, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.06938884531628577, + "language_loss": 0.81049907, + "learning_rate": 0.0004598543022321501, + "loss": 0.82131958, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.10736084, + "routerloss_mlp": 0.0, + "step": 2806, + "time_per_iteration": 2.6722495555877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855076, + "balance_loss_mlp": 1.46593428, + "diversity_loss_mlp": 0.21781196, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.030466031644405155, + "language_loss": 0.79783833, + "learning_rate": 0.0004595437735701433, + "loss": 0.80638903, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01320273, + "step": 2807, + "time_per_iteration": 2.734110116958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.07728648, + "diversity_loss_mlp": 0.0, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08474622827734493, + "language_loss": 0.83849192, + "learning_rate": 0.00045923326061463623, + "loss": 0.84937334, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2808, + "time_per_iteration": 2.7606189250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089545, + "balance_loss_mlp": 1.07878006, + "diversity_loss_mlp": 0.0, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06442619071995537, + "language_loss": 0.8173002, + "learning_rate": 0.00045892276348618113, + "loss": 0.82819563, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2809, + "time_per_iteration": 2.9691591262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_mlp": 1.02887774, + "diversity_loss_mlp": 0.0, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.01908051648382603, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294789, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2810, + "time_per_iteration": 4.957923173904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089256, + "balance_loss_mlp": 1.07848597, + "diversity_loss_mlp": 0.0, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.05960464217413758, + "language_loss": 0.80596066, + "learning_rate": 0.000458301817192603, + "loss": 0.81685317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2811, + "time_per_iteration": 2.852247714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021724, + "balance_loss_mlp": 1.0165503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.015447521326512613, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81863511, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.05175781, + "routerloss_mlp": 0.0, + "step": 2812, + "time_per_iteration": 4.808724880218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080487, + "balance_loss_mlp": 1.06993747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.06805695837678187, + "language_loss": 0.87130654, + "learning_rate": 0.00045768093565369983, + "loss": 0.88211143, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 2813, + "time_per_iteration": 2.7794101238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.08034182, + "diversity_loss_mlp": 0.0, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.06578755075233327, + "language_loss": 0.8208549, + "learning_rate": 0.0004573705194685646, + "loss": 0.83176434, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2814, + "time_per_iteration": 2.686871290206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084437, + "balance_loss_mlp": 1.07364845, + "diversity_loss_mlp": 0.0, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07321549809116977, + "language_loss": 0.84966654, + "learning_rate": 0.00045706011983366157, + "loss": 0.86051095, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2815, + "time_per_iteration": 2.676772117614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843207, + "balance_loss_mlp": 1.44560027, + "diversity_loss_mlp": 0.21445701, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.03775972378408833, + "language_loss": 0.82685602, + "learning_rate": 0.00045674973686949847, + "loss": 0.83528805, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01317827, + "step": 2816, + "time_per_iteration": 2.548164129257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079521, + "balance_loss_mlp": 1.06887531, + "diversity_loss_mlp": 0.0, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06715248152064907, + "language_loss": 0.85478067, + "learning_rate": 0.0004564393706965766, + "loss": 0.86557591, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2817, + "time_per_iteration": 2.9715416431427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078759, + "balance_loss_mlp": 1.06789875, + "diversity_loss_mlp": 0.0, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.07300594242261846, + "language_loss": 0.81410033, + "learning_rate": 0.00045612902143539116, + "loss": 0.82488787, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2818, + "time_per_iteration": 2.5861568450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.05926371, + "diversity_loss_mlp": 0.0, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.07796543703625758, + "language_loss": 0.8169418, + "learning_rate": 0.00045581868920642986, + "loss": 0.82763875, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2819, + "time_per_iteration": 2.495675563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079235, + "balance_loss_mlp": 1.06864905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.08284985931126, + "language_loss": 0.79605496, + "learning_rate": 0.00045550837413017457, + "loss": 0.80684733, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2820, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_mlp": 1.07137275, + "diversity_loss_mlp": 0.0, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06853869944040722, + "language_loss": 0.85501075, + "learning_rate": 0.0004551980763271005, + "loss": 0.86582589, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2821, + "time_per_iteration": 2.6689629554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080137, + "balance_loss_mlp": 1.06970072, + "diversity_loss_mlp": 0.0, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.07047505467714002, + "language_loss": 0.83788973, + "learning_rate": 0.0004548877959176756, + "loss": 0.84869111, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2822, + "time_per_iteration": 2.8898305892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.06903815, + "diversity_loss_mlp": 0.0, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06782192405371351, + "language_loss": 0.86297488, + "learning_rate": 0.00045457753302236166, + "loss": 0.87376869, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2823, + "time_per_iteration": 2.626262903213501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.07755554, + "diversity_loss_mlp": 0.0, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07336203540826484, + "language_loss": 0.87131381, + "learning_rate": 0.00045426728776161353, + "loss": 0.88219345, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2824, + "time_per_iteration": 2.7630255222320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085716, + "balance_loss_mlp": 1.07529116, + "diversity_loss_mlp": 0.0, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.07766893457840997, + "language_loss": 0.81382459, + "learning_rate": 0.00045395706025587863, + "loss": 0.82468176, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2825, + "time_per_iteration": 2.653036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.05976105, + "diversity_loss_mlp": 0.0, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.08392292239142347, + "language_loss": 0.82965428, + "learning_rate": 0.00045364685062559843, + "loss": 0.84035897, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2826, + "time_per_iteration": 2.8091156482696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075397, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06510139608888613, + "language_loss": 0.91622829, + "learning_rate": 0.0004533366589912067, + "loss": 0.92698228, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2827, + "time_per_iteration": 2.949005365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075847, + "balance_loss_mlp": 1.06538677, + "diversity_loss_mlp": 0.0, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.07049343673366977, + "language_loss": 0.77641904, + "learning_rate": 0.0004530264854731306, + "loss": 0.78717756, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2828, + "time_per_iteration": 3.054252862930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079568, + "balance_loss_mlp": 1.06920242, + "diversity_loss_mlp": 0.0, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05986165572949975, + "language_loss": 0.84122354, + "learning_rate": 0.00045271633019179034, + "loss": 0.85201919, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2829, + "time_per_iteration": 2.788818836212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.06762242, + "diversity_loss_mlp": 0.0, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05963281032217842, + "language_loss": 0.87701666, + "learning_rate": 0.0004524061932675986, + "loss": 0.88779569, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2830, + "time_per_iteration": 2.861154079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073509, + "balance_loss_mlp": 1.06306028, + "diversity_loss_mlp": 0.0, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.11132414831600651, + "language_loss": 0.87095535, + "learning_rate": 0.00045209607482096125, + "loss": 0.88169038, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2831, + "time_per_iteration": 3.041248321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107632, + "balance_loss_mlp": 1.06573415, + "diversity_loss_mlp": 0.0, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.07049073021000962, + "language_loss": 0.84385192, + "learning_rate": 0.0004517859749722772, + "loss": 0.85461509, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2832, + "time_per_iteration": 2.663478374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075242, + "balance_loss_mlp": 1.0643816, + "diversity_loss_mlp": 0.0, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.06386820666055518, + "language_loss": 0.79316235, + "learning_rate": 0.0004514758938419376, + "loss": 0.80391467, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.10870361, + "routerloss_mlp": 0.0, + "step": 2833, + "time_per_iteration": 2.8141582012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_mlp": 1.03721869, + "diversity_loss_mlp": 0.0, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.027736452139364785, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77963334, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2834, + "time_per_iteration": 4.960749864578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075071, + "balance_loss_mlp": 1.06446719, + "diversity_loss_mlp": 0.0, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.06436328535255592, + "language_loss": 0.83993077, + "learning_rate": 0.00045085578821782175, + "loss": 0.85068148, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2835, + "time_per_iteration": 2.6025185585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020548, + "balance_loss_mlp": 1.01516008, + "diversity_loss_mlp": 0.0, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.015651807900939278, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155292, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2836, + "time_per_iteration": 4.911514043807983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079494, + "balance_loss_mlp": 1.06864595, + "diversity_loss_mlp": 0.0, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.05502946705999508, + "language_loss": 0.81078947, + "learning_rate": 0.00045023575891159866, + "loss": 0.82158434, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2837, + "time_per_iteration": 2.7158284187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_mlp": 1.00321293, + "diversity_loss_mlp": 0.0, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.010060791837063862, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75772309, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2838, + "time_per_iteration": 4.9448912143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078413, + "balance_loss_mlp": 1.06803036, + "diversity_loss_mlp": 0.0, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.059936217606746015, + "language_loss": 0.78111225, + "learning_rate": 0.0004496158068861354, + "loss": 0.79189646, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2839, + "time_per_iteration": 2.8019115924835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.07090366, + "diversity_loss_mlp": 0.0, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06804602152838367, + "language_loss": 0.80713242, + "learning_rate": 0.00044930586015455207, + "loss": 0.81794775, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.10638428, + "routerloss_mlp": 0.0, + "step": 2840, + "time_per_iteration": 2.771359443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076777, + "balance_loss_mlp": 1.06646562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.0578733121218936, + "language_loss": 0.88904727, + "learning_rate": 0.000448995933104179, + "loss": 0.89981508, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2841, + "time_per_iteration": 2.8486392498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.07075977, + "diversity_loss_mlp": 0.0, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.07392730491467848, + "language_loss": 0.80162299, + "learning_rate": 0.00044868602585534077, + "loss": 0.81243765, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2842, + "time_per_iteration": 2.8463480472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074348, + "balance_loss_mlp": 1.06379187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.0858024928700591, + "language_loss": 0.89360344, + "learning_rate": 0.0004483761385283541, + "loss": 0.90434694, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2843, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870358, + "balance_loss_mlp": 1.4994092, + "diversity_loss_mlp": 0.21570696, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.030684440159293704, + "language_loss": 0.8165319, + "learning_rate": 0.0004480662712435281, + "loss": 0.82523549, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01280049, + "step": 2844, + "time_per_iteration": 2.7523300647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081741, + "balance_loss_mlp": 1.07085109, + "diversity_loss_mlp": 0.0, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.08261462073704483, + "language_loss": 0.88389564, + "learning_rate": 0.0004477564241211635, + "loss": 0.89471304, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2845, + "time_per_iteration": 2.5676896572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068187, + "balance_loss_mlp": 1.0573566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07762403474355188, + "language_loss": 0.868963, + "learning_rate": 0.0004474465972815541, + "loss": 0.87964487, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2846, + "time_per_iteration": 2.4843738079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073996, + "balance_loss_mlp": 1.06337464, + "diversity_loss_mlp": 0.0, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05857404260801407, + "language_loss": 0.87612844, + "learning_rate": 0.000447136790844985, + "loss": 0.88686836, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.10626221, + "routerloss_mlp": 0.0, + "step": 2847, + "time_per_iteration": 2.659214973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068933, + "balance_loss_mlp": 1.05774474, + "diversity_loss_mlp": 0.0, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.0657788254057266, + "language_loss": 0.80922693, + "learning_rate": 0.00044682700493173385, + "loss": 0.81991625, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2848, + "time_per_iteration": 2.8093039989471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.06077814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06921376228249611, + "language_loss": 0.80399549, + "learning_rate": 0.00044651723966207004, + "loss": 0.81471407, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2849, + "time_per_iteration": 3.1084961891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.05826974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.06382752106805908, + "language_loss": 0.78137773, + "learning_rate": 0.00044620749515625536, + "loss": 0.79206896, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2850, + "time_per_iteration": 2.8127682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.05505395, + "diversity_loss_mlp": 0.0, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.07084116902380141, + "language_loss": 0.85142213, + "learning_rate": 0.00044589777153454334, + "loss": 0.86208153, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2851, + "time_per_iteration": 2.7690277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_mlp": 1.05239749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.06308922523972363, + "language_loss": 0.83850712, + "learning_rate": 0.00044558806891717895, + "loss": 0.84914547, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2852, + "time_per_iteration": 2.542076587677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066106, + "balance_loss_mlp": 1.05529404, + "diversity_loss_mlp": 0.0, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06000502851088379, + "language_loss": 0.79783493, + "learning_rate": 0.0004452783874243998, + "loss": 0.808496, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.1081543, + "routerloss_mlp": 0.0, + "step": 2853, + "time_per_iteration": 2.8680150508880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070657, + "balance_loss_mlp": 1.06022012, + "diversity_loss_mlp": 0.0, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.07387916596955035, + "language_loss": 0.84572864, + "learning_rate": 0.00044496872717643475, + "loss": 0.85643518, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2854, + "time_per_iteration": 2.676128625869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_mlp": 1.04261672, + "diversity_loss_mlp": 0.0, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.03710413532206065, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78137678, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2855, + "time_per_iteration": 4.937518835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.06609333, + "diversity_loss_mlp": 0.0, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.06582649113696544, + "language_loss": 0.81989098, + "learning_rate": 0.0004443494708958217, + "loss": 0.83065504, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2856, + "time_per_iteration": 2.9764318466186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.06707263, + "diversity_loss_mlp": 0.0, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.05962775351044122, + "language_loss": 0.80705082, + "learning_rate": 0.0004440398751035906, + "loss": 0.81782728, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2857, + "time_per_iteration": 2.8708760738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107925, + "balance_loss_mlp": 1.06846118, + "diversity_loss_mlp": 0.0, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.08652259855452149, + "language_loss": 0.83723986, + "learning_rate": 0.00044373030103700645, + "loss": 0.84803236, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2858, + "time_per_iteration": 2.629887342453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857386, + "balance_loss_mlp": 1.47058845, + "diversity_loss_mlp": 0.21831456, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.03034959963101528, + "language_loss": 0.79655832, + "learning_rate": 0.000443420748816257, + "loss": 0.80513215, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293462, + "step": 2859, + "time_per_iteration": 2.8473408222198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107821, + "balance_loss_mlp": 1.06795764, + "diversity_loss_mlp": 0.0, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.07076083110298415, + "language_loss": 0.78692329, + "learning_rate": 0.0004431112185615208, + "loss": 0.79770535, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2860, + "time_per_iteration": 2.751131534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.0721283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.06396450124437818, + "language_loss": 0.7993266, + "learning_rate": 0.00044280171039296845, + "loss": 0.81015229, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2861, + "time_per_iteration": 2.606870651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082002, + "balance_loss_mlp": 1.0716126, + "diversity_loss_mlp": 0.0, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.0734058146638898, + "language_loss": 0.8832019, + "learning_rate": 0.0004424922244307616, + "loss": 0.89402187, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2862, + "time_per_iteration": 2.728055477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081346, + "balance_loss_mlp": 1.07124305, + "diversity_loss_mlp": 0.0, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.08810368166009505, + "language_loss": 0.82030249, + "learning_rate": 0.00044218276079505315, + "loss": 0.83111596, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2863, + "time_per_iteration": 2.8925743103027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076982, + "balance_loss_mlp": 1.0667721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.06918705117949257, + "language_loss": 0.74817479, + "learning_rate": 0.0004418733196059876, + "loss": 0.75894463, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2864, + "time_per_iteration": 2.747131109237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068041, + "balance_loss_mlp": 1.0579797, + "diversity_loss_mlp": 0.0, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.060188467246496694, + "language_loss": 0.79747194, + "learning_rate": 0.0004415639009837008, + "loss": 0.80815232, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 2865, + "time_per_iteration": 2.838609218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077074, + "balance_loss_mlp": 1.06704867, + "diversity_loss_mlp": 0.0, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.06869441498871262, + "language_loss": 0.82126647, + "learning_rate": 0.00044125450504831955, + "loss": 0.83203721, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2866, + "time_per_iteration": 2.7267115116119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.07046294, + "diversity_loss_mlp": 0.0, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.0812577822304444, + "language_loss": 0.82503623, + "learning_rate": 0.0004409451319199622, + "loss": 0.83584309, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2867, + "time_per_iteration": 2.6727194786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.07005203, + "diversity_loss_mlp": 0.0, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07302760882162292, + "language_loss": 0.84415638, + "learning_rate": 0.0004406357817187381, + "loss": 0.8549571, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2868, + "time_per_iteration": 2.9669716358184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.07424247, + "diversity_loss_mlp": 0.0, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.06120403113840053, + "language_loss": 0.81250817, + "learning_rate": 0.0004403264545647474, + "loss": 0.82335043, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 2869, + "time_per_iteration": 3.535280704498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092656, + "balance_loss_mlp": 1.08244562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.05305368525165607, + "language_loss": 0.84751379, + "learning_rate": 0.00044001715057808154, + "loss": 0.85844034, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2870, + "time_per_iteration": 2.757197618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867753, + "balance_loss_mlp": 1.49414647, + "diversity_loss_mlp": 0.21602358, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.02933333976418528, + "language_loss": 0.81627762, + "learning_rate": 0.0004397078698788232, + "loss": 0.82495517, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01266836, + "step": 2871, + "time_per_iteration": 3.241936445236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_mlp": 1.04097104, + "diversity_loss_mlp": 0.0, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.0256992480173019, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81488657, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2872, + "time_per_iteration": 4.879035234451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_mlp": 1.09304726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.06889966135830194, + "language_loss": 0.78025937, + "learning_rate": 0.00043908937882281343, + "loss": 0.79129106, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.10119629, + "routerloss_mlp": 0.0, + "step": 2873, + "time_per_iteration": 2.624072313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097291, + "balance_loss_mlp": 1.08644319, + "diversity_loss_mlp": 0.0, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.06659644406743612, + "language_loss": 0.82492054, + "learning_rate": 0.0004387801687061814, + "loss": 0.83589351, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2874, + "time_per_iteration": 2.839524269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_mlp": 1.09040689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.06411004123803754, + "language_loss": 0.80204833, + "learning_rate": 0.0004384709823571958, + "loss": 0.81305587, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2875, + "time_per_iteration": 2.768268346786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092947, + "balance_loss_mlp": 1.08278441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0827933156096061, + "language_loss": 0.83099473, + "learning_rate": 0.0004381618198958932, + "loss": 0.84192419, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 2876, + "time_per_iteration": 3.509364604949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084381, + "balance_loss_mlp": 1.07393849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0672046455921574, + "language_loss": 0.83616996, + "learning_rate": 0.00043785268144230137, + "loss": 0.84701377, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2877, + "time_per_iteration": 2.8941080570220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078858, + "balance_loss_mlp": 1.06849325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.08466064144544548, + "language_loss": 0.82657743, + "learning_rate": 0.00043754356711643837, + "loss": 0.83736604, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2878, + "time_per_iteration": 2.6849513053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.0620904, + "diversity_loss_mlp": 0.0, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.08115939494621484, + "language_loss": 0.84283209, + "learning_rate": 0.0004372344770383132, + "loss": 0.85355723, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2879, + "time_per_iteration": 2.809833526611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.05426884, + "diversity_loss_mlp": 0.0, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.15468249092113104, + "language_loss": 0.82951438, + "learning_rate": 0.00043692541132792507, + "loss": 0.84015906, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2880, + "time_per_iteration": 2.6886332035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.05541205, + "diversity_loss_mlp": 0.0, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07258014540865806, + "language_loss": 0.83396262, + "learning_rate": 0.00043661637010526384, + "loss": 0.84461993, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2881, + "time_per_iteration": 2.484912872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010637, + "balance_loss_mlp": 1.05335283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.07022154553173111, + "language_loss": 0.83217472, + "learning_rate": 0.00043630735349031025, + "loss": 0.8428117, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2882, + "time_per_iteration": 2.627950429916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.05427396, + "diversity_loss_mlp": 0.0, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.05734398116556458, + "language_loss": 0.81837022, + "learning_rate": 0.00043599836160303495, + "loss": 0.8290168, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2883, + "time_per_iteration": 2.87358021736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_mlp": 1.05094647, + "diversity_loss_mlp": 0.0, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.05952583825506871, + "language_loss": 0.77472365, + "learning_rate": 0.0004356893945633995, + "loss": 0.78534073, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2884, + "time_per_iteration": 2.9415786266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058414, + "balance_loss_mlp": 1.04738104, + "diversity_loss_mlp": 0.0, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06387157363580499, + "language_loss": 0.81997669, + "learning_rate": 0.0004353804524913551, + "loss": 0.8305608, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2885, + "time_per_iteration": 2.5772132873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106399, + "balance_loss_mlp": 1.05298674, + "diversity_loss_mlp": 0.0, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07314612024272811, + "language_loss": 0.82015049, + "learning_rate": 0.0004350715355068441, + "loss": 0.8307904, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.11010742, + "routerloss_mlp": 0.0, + "step": 2886, + "time_per_iteration": 2.7211849689483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062969, + "balance_loss_mlp": 1.05221653, + "diversity_loss_mlp": 0.0, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.08671001380075964, + "language_loss": 0.79774809, + "learning_rate": 0.00043476264372979847, + "loss": 0.8083778, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.10754395, + "routerloss_mlp": 0.0, + "step": 2887, + "time_per_iteration": 2.5452206134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.05403173, + "diversity_loss_mlp": 0.0, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.08125450311694367, + "language_loss": 0.78590369, + "learning_rate": 0.0004344537772801408, + "loss": 0.79654968, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.10577393, + "routerloss_mlp": 0.0, + "step": 2888, + "time_per_iteration": 3.870267391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_mlp": 1.02839172, + "diversity_loss_mlp": 0.0, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.026917818165577125, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74456155, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2889, + "time_per_iteration": 4.943026065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091351, + "balance_loss_mlp": 1.08043766, + "diversity_loss_mlp": 0.0, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07456412824125162, + "language_loss": 0.83536172, + "learning_rate": 0.0004338361208426298, + "loss": 0.84627521, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2890, + "time_per_iteration": 2.65266752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094404, + "balance_loss_mlp": 1.08348465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.057576040721241756, + "language_loss": 0.81499392, + "learning_rate": 0.00043352733109457164, + "loss": 0.82593793, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2891, + "time_per_iteration": 2.927246332168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106638, + "balance_loss_mlp": 1.09556401, + "diversity_loss_mlp": 0.0, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.0763949134442708, + "language_loss": 0.84462321, + "learning_rate": 0.00043321856715349244, + "loss": 0.85568959, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2892, + "time_per_iteration": 2.970857858657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_mlp": 1.0918721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07453927070697552, + "language_loss": 0.80594504, + "learning_rate": 0.00043290982913926466, + "loss": 0.81697285, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.10913086, + "routerloss_mlp": 0.0, + "step": 2893, + "time_per_iteration": 2.8581972122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105658, + "balance_loss_mlp": 1.09473801, + "diversity_loss_mlp": 0.0, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.08476057735977802, + "language_loss": 0.84177083, + "learning_rate": 0.0004326011171717514, + "loss": 0.85282743, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2894, + "time_per_iteration": 2.90563702583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094642, + "balance_loss_mlp": 1.08371019, + "diversity_loss_mlp": 0.0, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06785531665857511, + "language_loss": 0.80468631, + "learning_rate": 0.0004322924313708051, + "loss": 0.8156327, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2895, + "time_per_iteration": 2.51784610748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092855, + "balance_loss_mlp": 1.08219218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.07706946900287333, + "language_loss": 0.84533763, + "learning_rate": 0.0004319837718562681, + "loss": 0.85626626, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.10668945, + "routerloss_mlp": 0.0, + "step": 2896, + "time_per_iteration": 2.5862512588500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083747, + "balance_loss_mlp": 1.07321525, + "diversity_loss_mlp": 0.0, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.0793708179068888, + "language_loss": 0.83050567, + "learning_rate": 0.0004316751387479726, + "loss": 0.84134316, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2897, + "time_per_iteration": 2.778136730194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857516, + "balance_loss_mlp": 1.47219694, + "diversity_loss_mlp": 0.21748725, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.034004819690404205, + "language_loss": 0.82499564, + "learning_rate": 0.0004313665321657409, + "loss": 0.83357084, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267361, + "step": 2898, + "time_per_iteration": 3.7754030227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06795418, + "diversity_loss_mlp": 0.0, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.08236969633510602, + "language_loss": 0.79824448, + "learning_rate": 0.00043105795222938436, + "loss": 0.80903113, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2899, + "time_per_iteration": 2.7090694904327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073397, + "balance_loss_mlp": 1.06296027, + "diversity_loss_mlp": 0.0, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07659548301877016, + "language_loss": 0.78690445, + "learning_rate": 0.00043074939905870467, + "loss": 0.79763848, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2900, + "time_per_iteration": 2.6444900035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.08372730008806528, + "language_loss": 0.80284113, + "learning_rate": 0.0004304408727734927, + "loss": 0.81353253, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 2901, + "time_per_iteration": 2.6800661087036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855039, + "balance_loss_mlp": 1.46478724, + "diversity_loss_mlp": 0.21833366, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.026106559121528438, + "language_loss": 0.88945115, + "learning_rate": 0.0004301323734935288, + "loss": 0.89800155, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01347797, + "step": 2902, + "time_per_iteration": 2.6880388259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05446076, + "diversity_loss_mlp": 0.0, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.08715674624995783, + "language_loss": 0.87386537, + "learning_rate": 0.000429823901338583, + "loss": 0.88451326, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2903, + "time_per_iteration": 2.611330032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.06004524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.07350666628476007, + "language_loss": 0.86772639, + "learning_rate": 0.00042951545642841513, + "loss": 0.87843215, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2904, + "time_per_iteration": 3.066653251647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06802535, + "diversity_loss_mlp": 0.0, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06907930895976065, + "language_loss": 0.86694556, + "learning_rate": 0.0004292070388827737, + "loss": 0.87773216, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.10644531, + "routerloss_mlp": 0.0, + "step": 2905, + "time_per_iteration": 2.5430614948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068326, + "balance_loss_mlp": 1.05785918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06877653703862108, + "language_loss": 0.81346464, + "learning_rate": 0.00042889864882139753, + "loss": 0.82414794, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2906, + "time_per_iteration": 2.5722434520721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075755, + "balance_loss_mlp": 1.06534863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06732553967994827, + "language_loss": 0.81503737, + "learning_rate": 0.0004285902863640139, + "loss": 0.82579494, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 2907, + "time_per_iteration": 2.643721580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_mlp": 1.06431222, + "diversity_loss_mlp": 0.0, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.06943407338412115, + "language_loss": 0.86278725, + "learning_rate": 0.00042828195163033966, + "loss": 0.87353367, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2908, + "time_per_iteration": 2.7045791149139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081822, + "balance_loss_mlp": 1.07135582, + "diversity_loss_mlp": 0.0, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07324820072157985, + "language_loss": 0.79102659, + "learning_rate": 0.0004279736447400812, + "loss": 0.80184484, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2909, + "time_per_iteration": 2.585176944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107588, + "balance_loss_mlp": 1.06558049, + "diversity_loss_mlp": 0.0, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.07142642262643135, + "language_loss": 0.78468478, + "learning_rate": 0.00042766536581293385, + "loss": 0.79544365, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2910, + "time_per_iteration": 2.723602771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07975566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.0702995437532307, + "language_loss": 0.79552364, + "learning_rate": 0.0004273571149685819, + "loss": 0.80642736, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2911, + "time_per_iteration": 2.7220258712768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091791, + "balance_loss_mlp": 1.08147311, + "diversity_loss_mlp": 0.0, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.06270923487878967, + "language_loss": 0.84021366, + "learning_rate": 0.00042704889232669937, + "loss": 0.85113156, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2912, + "time_per_iteration": 2.6799380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848913, + "balance_loss_mlp": 1.45588994, + "diversity_loss_mlp": 0.21708892, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.03254511626684893, + "language_loss": 0.85648382, + "learning_rate": 0.0004267406980069484, + "loss": 0.86497295, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01242387, + "step": 2913, + "time_per_iteration": 2.7309391498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.10193157, + "diversity_loss_mlp": 0.0, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.05402445789476675, + "language_loss": 0.79744071, + "learning_rate": 0.0004264325321289808, + "loss": 0.80856508, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2914, + "time_per_iteration": 2.8245773315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_mlp": 1.09404707, + "diversity_loss_mlp": 0.0, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.07588418732744176, + "language_loss": 0.86308336, + "learning_rate": 0.00042612439481243736, + "loss": 0.87412667, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2915, + "time_per_iteration": 2.7910971641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_mlp": 1.09916496, + "diversity_loss_mlp": 0.0, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.07165476469353879, + "language_loss": 0.90284097, + "learning_rate": 0.00042581628617694735, + "loss": 0.91393661, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2916, + "time_per_iteration": 2.7449898719787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00839442, + "balance_loss_mlp": 1.43753612, + "diversity_loss_mlp": 0.21687999, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.03331291255724556, + "language_loss": 0.81856477, + "learning_rate": 0.0004255082063421296, + "loss": 0.82695925, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01223436, + "step": 2917, + "time_per_iteration": 2.705263614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.12130046, + "diversity_loss_mlp": 0.0, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.07697799391889214, + "language_loss": 0.84842837, + "learning_rate": 0.00042520015542759065, + "loss": 0.85974395, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2918, + "time_per_iteration": 2.8643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_mlp": 1.09857666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.059259650717302215, + "language_loss": 0.88182557, + "learning_rate": 0.00042489213355292687, + "loss": 0.89291489, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2919, + "time_per_iteration": 2.871605634689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113923, + "balance_loss_mlp": 1.1035037, + "diversity_loss_mlp": 0.0, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.07025137955977834, + "language_loss": 0.81129396, + "learning_rate": 0.00042458414083772276, + "loss": 0.82243323, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2920, + "time_per_iteration": 2.5280137062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110383, + "balance_loss_mlp": 1.09353638, + "diversity_loss_mlp": 0.0, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.06291310679725345, + "language_loss": 0.85259616, + "learning_rate": 0.000424276177401552, + "loss": 0.86363447, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 2921, + "time_per_iteration": 2.8061861991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091107, + "balance_loss_mlp": 1.08052063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06947728514830868, + "language_loss": 0.8586399, + "learning_rate": 0.0004239682433639763, + "loss": 0.86955094, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2922, + "time_per_iteration": 2.7068192958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.07726383, + "diversity_loss_mlp": 0.0, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.06724553342566655, + "language_loss": 0.85617495, + "learning_rate": 0.0004236603388445467, + "loss": 0.86705184, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 2923, + "time_per_iteration": 2.5658164024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_mlp": 1.07329023, + "diversity_loss_mlp": 0.0, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.06491959150956746, + "language_loss": 0.82087809, + "learning_rate": 0.00042335246396280166, + "loss": 0.83171237, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 2924, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076248, + "balance_loss_mlp": 1.06606197, + "diversity_loss_mlp": 0.0, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06924351044147684, + "language_loss": 0.90442908, + "learning_rate": 0.0004230446188382693, + "loss": 0.91519153, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2925, + "time_per_iteration": 2.5210559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072823, + "balance_loss_mlp": 1.06237423, + "diversity_loss_mlp": 0.0, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06189914516088338, + "language_loss": 0.80191588, + "learning_rate": 0.0004227368035904654, + "loss": 0.81264406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2926, + "time_per_iteration": 2.957545757293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073658, + "balance_loss_mlp": 1.06312013, + "diversity_loss_mlp": 0.0, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.07119677802103677, + "language_loss": 0.8312782, + "learning_rate": 0.00042242901833889474, + "loss": 0.84201479, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 2927, + "time_per_iteration": 2.6197497844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069584, + "balance_loss_mlp": 1.05933261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.07548469953325632, + "language_loss": 0.85944557, + "learning_rate": 0.0004221212632030501, + "loss": 0.87014145, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2928, + "time_per_iteration": 3.0718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074032, + "balance_loss_mlp": 1.0636375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.0702405954135719, + "language_loss": 0.8005904, + "learning_rate": 0.0004218135383024124, + "loss": 0.81133074, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2929, + "time_per_iteration": 2.6883885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068804, + "balance_loss_mlp": 1.05836129, + "diversity_loss_mlp": 0.0, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.07423933793606223, + "language_loss": 0.85405028, + "learning_rate": 0.0004215058437564511, + "loss": 0.86473835, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2930, + "time_per_iteration": 2.5645458698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.06520677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.07045402067927274, + "language_loss": 0.82365847, + "learning_rate": 0.00042119817968462397, + "loss": 0.83441579, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2931, + "time_per_iteration": 2.596431255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843243, + "balance_loss_mlp": 1.44432163, + "diversity_loss_mlp": 0.21611315, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.034099962370994746, + "language_loss": 0.87154222, + "learning_rate": 0.0004208905462063766, + "loss": 0.8799746, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01302544, + "step": 2932, + "time_per_iteration": 2.7103724479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088146, + "balance_loss_mlp": 1.07760167, + "diversity_loss_mlp": 0.0, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07257480225633914, + "language_loss": 0.84035242, + "learning_rate": 0.00042058294344114315, + "loss": 0.8512339, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2933, + "time_per_iteration": 2.6817541122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846618, + "balance_loss_mlp": 1.45035362, + "diversity_loss_mlp": 0.21710092, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.03239193802507573, + "language_loss": 0.77597153, + "learning_rate": 0.0004202753715083456, + "loss": 0.78443778, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289086, + "step": 2934, + "time_per_iteration": 3.1172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.08684492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.08960488369203884, + "language_loss": 0.8126961, + "learning_rate": 0.0004199678305273936, + "loss": 0.82367325, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2935, + "time_per_iteration": 2.648293972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096103, + "balance_loss_mlp": 1.08564794, + "diversity_loss_mlp": 0.0, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06584718006017456, + "language_loss": 0.81395173, + "learning_rate": 0.0004196603206176854, + "loss": 0.82491279, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2936, + "time_per_iteration": 2.9504921436309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_mlp": 1.09094691, + "diversity_loss_mlp": 0.0, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.06854637503151859, + "language_loss": 0.83705592, + "learning_rate": 0.000419352841898607, + "loss": 0.84806919, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2937, + "time_per_iteration": 2.965176582336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_mlp": 1.09003913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.06908295336200668, + "language_loss": 0.77684075, + "learning_rate": 0.000419045394489532, + "loss": 0.7878446, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2938, + "time_per_iteration": 2.692997455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094877, + "balance_loss_mlp": 1.08429718, + "diversity_loss_mlp": 0.0, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.06508171061148607, + "language_loss": 0.76831025, + "learning_rate": 0.0004187379785098224, + "loss": 0.77925897, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2939, + "time_per_iteration": 3.123154401779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110149, + "balance_loss_mlp": 1.09110653, + "diversity_loss_mlp": 0.0, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.08014464510269267, + "language_loss": 0.83749938, + "learning_rate": 0.00041843059407882744, + "loss": 0.84851432, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2940, + "time_per_iteration": 2.9720611572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099107, + "balance_loss_mlp": 1.0887475, + "diversity_loss_mlp": 0.0, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.06910210619422795, + "language_loss": 0.82642627, + "learning_rate": 0.0004181232413158842, + "loss": 0.83741736, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2941, + "time_per_iteration": 2.657360315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094217, + "balance_loss_mlp": 1.08388722, + "diversity_loss_mlp": 0.0, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08913898875539945, + "language_loss": 0.82192254, + "learning_rate": 0.0004178159203403179, + "loss": 0.83286464, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2942, + "time_per_iteration": 2.8812596797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.07014799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06202774017820852, + "language_loss": 0.8130517, + "learning_rate": 0.0004175086312714409, + "loss": 0.82385445, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 2943, + "time_per_iteration": 2.561537027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.07015431, + "diversity_loss_mlp": 0.0, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05809127095966742, + "language_loss": 0.83570457, + "learning_rate": 0.00041720137422855366, + "loss": 0.84651101, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 2944, + "time_per_iteration": 2.7395284175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.06576228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.07239714207057282, + "language_loss": 0.79116005, + "learning_rate": 0.00041689414933094383, + "loss": 0.80191475, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2945, + "time_per_iteration": 2.654930353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067367, + "balance_loss_mlp": 1.05734193, + "diversity_loss_mlp": 0.0, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.07615309090382201, + "language_loss": 0.80823922, + "learning_rate": 0.00041658695669788653, + "loss": 0.81891298, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2946, + "time_per_iteration": 2.747903347015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.05894506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.09594015960064259, + "language_loss": 0.81304628, + "learning_rate": 0.00041627979644864453, + "loss": 0.82373923, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2947, + "time_per_iteration": 2.8192365169525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064628, + "balance_loss_mlp": 1.05435264, + "diversity_loss_mlp": 0.0, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06124486727819338, + "language_loss": 0.81212783, + "learning_rate": 0.0004159726687024683, + "loss": 0.82277411, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 2948, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_mlp": 1.05610037, + "diversity_loss_mlp": 0.0, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.0698899799050157, + "language_loss": 0.7929486, + "learning_rate": 0.00041566557357859506, + "loss": 0.80361444, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2949, + "time_per_iteration": 2.861374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05816913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.0603589352170923, + "language_loss": 0.79605162, + "learning_rate": 0.0004153585111962502, + "loss": 0.80673802, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2950, + "time_per_iteration": 3.3136749267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.06528509, + "diversity_loss_mlp": 0.0, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.07046051490297799, + "language_loss": 0.84271163, + "learning_rate": 0.0004150514816746453, + "loss": 0.85347259, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 2951, + "time_per_iteration": 2.7142550945281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079575, + "balance_loss_mlp": 1.0689894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07561213643312675, + "language_loss": 0.85564739, + "learning_rate": 0.0004147444851329802, + "loss": 0.8664431, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2952, + "time_per_iteration": 2.663442611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079915, + "balance_loss_mlp": 1.06943655, + "diversity_loss_mlp": 0.0, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.06334656392280237, + "language_loss": 0.85917854, + "learning_rate": 0.00041443752169044126, + "loss": 0.86997765, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2953, + "time_per_iteration": 3.0424787998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083209, + "balance_loss_mlp": 1.07296944, + "diversity_loss_mlp": 0.0, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.08759511227816434, + "language_loss": 0.84844387, + "learning_rate": 0.0004141305914662025, + "loss": 0.85927594, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2954, + "time_per_iteration": 2.720574378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080604, + "balance_loss_mlp": 1.06977344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0625505952609041, + "language_loss": 0.80443704, + "learning_rate": 0.0004138236945794246, + "loss": 0.81524312, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2955, + "time_per_iteration": 2.880007743835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067912, + "balance_loss_mlp": 1.05775595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.08164782403227437, + "language_loss": 0.84066302, + "learning_rate": 0.00041351683114925576, + "loss": 0.85134214, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2956, + "time_per_iteration": 3.061213731765747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.06213737, + "diversity_loss_mlp": 0.0, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06079019071224684, + "language_loss": 0.86355555, + "learning_rate": 0.0004132100012948308, + "loss": 0.87427759, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 2957, + "time_per_iteration": 2.631786823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_mlp": 1.0587523, + "diversity_loss_mlp": 0.0, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.07979265854660174, + "language_loss": 0.84526646, + "learning_rate": 0.00041290320513527145, + "loss": 0.85595882, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2958, + "time_per_iteration": 2.5593366622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.05111814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.09201222931646683, + "language_loss": 0.85128796, + "learning_rate": 0.0004125964427896867, + "loss": 0.86190271, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.1036377, + "routerloss_mlp": 0.0, + "step": 2959, + "time_per_iteration": 2.667381525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.05320501, + "diversity_loss_mlp": 0.0, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06922825543149586, + "language_loss": 0.79212141, + "learning_rate": 0.0004122897143771723, + "loss": 0.80275661, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2960, + "time_per_iteration": 2.523068904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.0569005, + "diversity_loss_mlp": 0.0, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06880331468011665, + "language_loss": 0.81306094, + "learning_rate": 0.0004119830200168109, + "loss": 0.82373345, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2961, + "time_per_iteration": 2.7224626541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106382, + "balance_loss_mlp": 1.05356169, + "diversity_loss_mlp": 0.0, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08443053343043137, + "language_loss": 0.88515878, + "learning_rate": 0.0004116763598276714, + "loss": 0.89579695, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 2962, + "time_per_iteration": 2.4910728931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05738318, + "diversity_loss_mlp": 0.0, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.07427131552828858, + "language_loss": 0.81298989, + "learning_rate": 0.00041136973392881017, + "loss": 0.82366574, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 2963, + "time_per_iteration": 2.8261218070983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_mlp": 1.05275846, + "diversity_loss_mlp": 0.0, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.0795338566562928, + "language_loss": 0.82039535, + "learning_rate": 0.00041106314243926983, + "loss": 0.83102989, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2964, + "time_per_iteration": 2.7321033477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058191, + "balance_loss_mlp": 1.04802823, + "diversity_loss_mlp": 0.0, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.07985594809339186, + "language_loss": 0.87473917, + "learning_rate": 0.0004107565854780798, + "loss": 0.88532114, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2965, + "time_per_iteration": 2.685188055038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105982, + "balance_loss_mlp": 1.0495863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.12021988187086102, + "language_loss": 0.80887079, + "learning_rate": 0.000410450063164256, + "loss": 0.81946903, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2966, + "time_per_iteration": 2.8859732151031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_mlp": 1.05084372, + "diversity_loss_mlp": 0.0, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.07877125068742231, + "language_loss": 0.82298398, + "learning_rate": 0.00041014357561680115, + "loss": 0.83359516, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2967, + "time_per_iteration": 2.5546090602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072036, + "balance_loss_mlp": 1.06186163, + "diversity_loss_mlp": 0.0, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0603559044145355, + "language_loss": 0.86396813, + "learning_rate": 0.0004098371229547039, + "loss": 0.87468845, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 2968, + "time_per_iteration": 2.7246880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055906, + "balance_loss_mlp": 1.05082798, + "diversity_loss_mlp": 0.0, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.032213471653528905, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81066716, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2969, + "time_per_iteration": 4.802457571029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845784, + "balance_loss_mlp": 1.44834208, + "diversity_loss_mlp": 0.21849446, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.042172582609019446, + "language_loss": 0.80489594, + "learning_rate": 0.00040922432276247107, + "loss": 0.81335378, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01236574, + "step": 2970, + "time_per_iteration": 2.579711675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100592, + "balance_loss_mlp": 1.09026289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.08651791755700546, + "language_loss": 0.84556907, + "learning_rate": 0.0004089179754702457, + "loss": 0.85657501, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2971, + "time_per_iteration": 2.744509220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109172, + "balance_loss_mlp": 1.08128309, + "diversity_loss_mlp": 0.0, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.0875480726861112, + "language_loss": 0.79658413, + "learning_rate": 0.00040861166353919843, + "loss": 0.80750132, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2972, + "time_per_iteration": 2.816767692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843649, + "balance_loss_mlp": 1.44322622, + "diversity_loss_mlp": 0.21953782, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.0303598736791247, + "language_loss": 0.81879437, + "learning_rate": 0.00040830538708824983, + "loss": 0.82723081, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226737, + "step": 2973, + "time_per_iteration": 2.8936269283294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084736, + "balance_loss_mlp": 1.07479978, + "diversity_loss_mlp": 0.0, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.06866249599002382, + "language_loss": 0.81754982, + "learning_rate": 0.000407999146236307, + "loss": 0.82839715, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2974, + "time_per_iteration": 2.558587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086743, + "balance_loss_mlp": 1.07657444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.07286762161416734, + "language_loss": 0.83382261, + "learning_rate": 0.0004076929411022634, + "loss": 0.84468997, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2975, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.07231879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.06868291627032407, + "language_loss": 0.79575276, + "learning_rate": 0.0004073867718049982, + "loss": 0.80657583, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 2976, + "time_per_iteration": 3.082519054412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841274, + "balance_loss_mlp": 1.44052804, + "diversity_loss_mlp": 0.21771878, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.03510584247140754, + "language_loss": 0.8255651, + "learning_rate": 0.00040708063846337704, + "loss": 0.83397782, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01215104, + "step": 2977, + "time_per_iteration": 2.7563750743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108914, + "balance_loss_mlp": 1.07897186, + "diversity_loss_mlp": 0.0, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07105452232664011, + "language_loss": 0.81019402, + "learning_rate": 0.00040677454119625143, + "loss": 0.82108539, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2978, + "time_per_iteration": 2.575923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089611, + "balance_loss_mlp": 1.07962155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07243213986729599, + "language_loss": 0.82912952, + "learning_rate": 0.0004064684801224587, + "loss": 0.84002566, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 2979, + "time_per_iteration": 2.5965535640716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085844, + "balance_loss_mlp": 1.07600939, + "diversity_loss_mlp": 0.0, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.11138747568582645, + "language_loss": 0.80322999, + "learning_rate": 0.00040616245536082224, + "loss": 0.81408834, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2980, + "time_per_iteration": 2.599320650100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079792, + "balance_loss_mlp": 1.07008803, + "diversity_loss_mlp": 0.0, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.06764455313032879, + "language_loss": 0.81366718, + "learning_rate": 0.00040585646703015165, + "loss": 0.82446504, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2981, + "time_per_iteration": 2.8000056743621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0740515, + "diversity_loss_mlp": 0.0, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07435230765684324, + "language_loss": 0.78094304, + "learning_rate": 0.0004055505152492419, + "loss": 0.79178286, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2982, + "time_per_iteration": 2.6867222785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_mlp": 1.06574273, + "diversity_loss_mlp": 0.0, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.06874763078804642, + "language_loss": 0.74040514, + "learning_rate": 0.00040524460013687425, + "loss": 0.7511642, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2983, + "time_per_iteration": 2.722419500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06058455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.06717754752260814, + "language_loss": 0.81118953, + "learning_rate": 0.0004049387218118155, + "loss": 0.82189637, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2984, + "time_per_iteration": 2.960744857788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.05519915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07543134348802799, + "language_loss": 0.85138291, + "learning_rate": 0.00040463288039281777, + "loss": 0.86203879, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2985, + "time_per_iteration": 2.769758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.03847778, + "diversity_loss_mlp": 0.0, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.0202426857746204, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78919691, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 2986, + "time_per_iteration": 4.966659784317017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_mlp": 1.05206716, + "diversity_loss_mlp": 0.0, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.15131369926607025, + "language_loss": 0.82060635, + "learning_rate": 0.0004040213087479444, + "loss": 0.83122802, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2987, + "time_per_iteration": 2.9445290565490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.0615747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.0782867157663105, + "language_loss": 0.85397077, + "learning_rate": 0.0004037155787595018, + "loss": 0.86468589, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2988, + "time_per_iteration": 2.5765254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066911, + "balance_loss_mlp": 1.05708241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06722963936024443, + "language_loss": 0.80743146, + "learning_rate": 0.000403409886151987, + "loss": 0.81810057, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2989, + "time_per_iteration": 2.916736364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_mlp": 1.02410662, + "diversity_loss_mlp": 0.0, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.01652195359171043, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.8302803, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.0480957, + "routerloss_mlp": 0.0, + "step": 2990, + "time_per_iteration": 4.79939866065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019783, + "balance_loss_mlp": 1.0149194, + "diversity_loss_mlp": 0.0, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.012607930583697005, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79218388, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 2991, + "time_per_iteration": 4.873241901397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_mlp": 1.06493187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07321689676824589, + "language_loss": 0.7675758, + "learning_rate": 0.00040249303380173807, + "loss": 0.77832061, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 2992, + "time_per_iteration": 3.119454860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075897, + "balance_loss_mlp": 1.06607461, + "diversity_loss_mlp": 0.0, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06951674167184135, + "language_loss": 0.78929973, + "learning_rate": 0.00040218749190459126, + "loss": 0.80005872, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 2993, + "time_per_iteration": 2.735741138458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074749, + "balance_loss_mlp": 1.06464601, + "diversity_loss_mlp": 0.0, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.09040694151318206, + "language_loss": 0.82524914, + "learning_rate": 0.00040188198798162775, + "loss": 0.83599663, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2994, + "time_per_iteration": 2.604189872741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107928, + "balance_loss_mlp": 1.06903386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.07247823517444965, + "language_loss": 0.85413349, + "learning_rate": 0.000401576522151455, + "loss": 0.86492634, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.10247803, + "routerloss_mlp": 0.0, + "step": 2995, + "time_per_iteration": 2.8580820560455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082336, + "balance_loss_mlp": 1.07231033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.07641213429349043, + "language_loss": 0.82611746, + "learning_rate": 0.0004012710945326651, + "loss": 0.83694082, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2996, + "time_per_iteration": 2.7899913787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093927, + "balance_loss_mlp": 1.08396673, + "diversity_loss_mlp": 0.0, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.06499516885792743, + "language_loss": 0.81305802, + "learning_rate": 0.0004009657052438355, + "loss": 0.82399726, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 2997, + "time_per_iteration": 2.7985143661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109354, + "balance_loss_mlp": 1.08339536, + "diversity_loss_mlp": 0.0, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.07919341256021087, + "language_loss": 0.85873878, + "learning_rate": 0.00040066035440352904, + "loss": 0.86967415, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2998, + "time_per_iteration": 2.633052110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_mlp": 1.02706063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.024696349234847453, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80325484, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2999, + "time_per_iteration": 4.901000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111633, + "balance_loss_mlp": 1.10161996, + "diversity_loss_mlp": 0.0, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.09685011562347093, + "language_loss": 0.76085562, + "learning_rate": 0.00040004976854266145, + "loss": 0.77197194, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3000, + "time_per_iteration": 2.5440561771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_mlp": 1.09615445, + "diversity_loss_mlp": 0.0, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.08566214489971447, + "language_loss": 0.81596673, + "learning_rate": 0.0003997445337591505, + "loss": 0.82703155, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.10327148, + "routerloss_mlp": 0.0, + "step": 3001, + "time_per_iteration": 2.6576101779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101254, + "balance_loss_mlp": 1.09120488, + "diversity_loss_mlp": 0.0, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.07034086792873868, + "language_loss": 0.74008942, + "learning_rate": 0.0003994393378982635, + "loss": 0.75110197, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3002, + "time_per_iteration": 2.646756172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_mlp": 1.02816153, + "diversity_loss_mlp": 0.0, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.018933197318392565, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80571294, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3003, + "time_per_iteration": 4.810927867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084692, + "balance_loss_mlp": 1.07440448, + "diversity_loss_mlp": 0.0, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.09168460196837042, + "language_loss": 0.8788178, + "learning_rate": 0.0003988290634182961, + "loss": 0.88966477, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3004, + "time_per_iteration": 2.8026678562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086517, + "balance_loss_mlp": 1.0765686, + "diversity_loss_mlp": 0.0, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07023697016091271, + "language_loss": 0.80836314, + "learning_rate": 0.0003985239850361453, + "loss": 0.81922829, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3005, + "time_per_iteration": 2.605581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_mlp": 1.0739491, + "diversity_loss_mlp": 0.0, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.08589270039345176, + "language_loss": 0.84542817, + "learning_rate": 0.0003982189460504777, + "loss": 0.85626608, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3006, + "time_per_iteration": 2.755309820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081707, + "balance_loss_mlp": 1.07148504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07367765629951939, + "language_loss": 0.79058981, + "learning_rate": 0.00039791394657971935, + "loss": 0.80140698, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3007, + "time_per_iteration": 2.7115721702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083463, + "balance_loss_mlp": 1.07349145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08639799759711958, + "language_loss": 0.84195948, + "learning_rate": 0.00039760898674228205, + "loss": 0.85279417, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3008, + "time_per_iteration": 2.6536192893981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.07249665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.06522284264232586, + "language_loss": 0.80620825, + "learning_rate": 0.0003973040666565613, + "loss": 0.81703728, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 3009, + "time_per_iteration": 3.0663528442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_mlp": 1.07382393, + "diversity_loss_mlp": 0.0, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06612730330601824, + "language_loss": 0.82148051, + "learning_rate": 0.000396999186440938, + "loss": 0.83232027, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3010, + "time_per_iteration": 2.8332176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078314, + "balance_loss_mlp": 1.06794286, + "diversity_loss_mlp": 0.0, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.0828593686110812, + "language_loss": 0.85258269, + "learning_rate": 0.000396694346213777, + "loss": 0.86336583, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 3011, + "time_per_iteration": 2.6009714603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107282, + "balance_loss_mlp": 1.06272256, + "diversity_loss_mlp": 0.0, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06962390382868744, + "language_loss": 0.83265769, + "learning_rate": 0.0003963895460934276, + "loss": 0.84338593, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3012, + "time_per_iteration": 3.1654391288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069146, + "balance_loss_mlp": 1.05900097, + "diversity_loss_mlp": 0.0, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07925389671051855, + "language_loss": 0.84790504, + "learning_rate": 0.00039608478619822376, + "loss": 0.85859656, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.10144043, + "routerloss_mlp": 0.0, + "step": 3013, + "time_per_iteration": 2.427522659301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.05792189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06006231039706783, + "language_loss": 0.82350284, + "learning_rate": 0.00039578006664648394, + "loss": 0.83418107, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3014, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073341, + "balance_loss_mlp": 1.06352377, + "diversity_loss_mlp": 0.0, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06972986465808689, + "language_loss": 0.81348431, + "learning_rate": 0.0003954753875565105, + "loss": 0.82421774, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3015, + "time_per_iteration": 3.0640695095062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072847, + "balance_loss_mlp": 1.06282723, + "diversity_loss_mlp": 0.0, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.07357715078918559, + "language_loss": 0.82623494, + "learning_rate": 0.00039517074904659057, + "loss": 0.83696342, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3016, + "time_per_iteration": 2.6665265560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010727, + "balance_loss_mlp": 1.06269789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.06753013197016527, + "language_loss": 0.84737754, + "learning_rate": 0.00039486615123499535, + "loss": 0.85810453, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3017, + "time_per_iteration": 2.868724822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067949, + "balance_loss_mlp": 1.05761325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.06414820954678578, + "language_loss": 0.84855384, + "learning_rate": 0.00039456159423997996, + "loss": 0.85923326, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 3018, + "time_per_iteration": 2.7043581008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067563, + "balance_loss_mlp": 1.05765033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06908857206879536, + "language_loss": 0.89950442, + "learning_rate": 0.00039425707817978406, + "loss": 0.91018009, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3019, + "time_per_iteration": 2.661128044128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106838, + "balance_loss_mlp": 1.0578835, + "diversity_loss_mlp": 0.0, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.08125232064199928, + "language_loss": 0.83649898, + "learning_rate": 0.00039395260317263124, + "loss": 0.84718275, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 3020, + "time_per_iteration": 2.5645148754119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.06039524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.06887634041791851, + "language_loss": 0.85043871, + "learning_rate": 0.0003936481693367291, + "loss": 0.86114681, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3021, + "time_per_iteration": 2.7062771320343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.06673217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08641696356618225, + "language_loss": 0.87619507, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697034, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 3022, + "time_per_iteration": 2.7680017948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.06846249, + "diversity_loss_mlp": 0.0, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.0708496595357851, + "language_loss": 0.78467089, + "learning_rate": 0.00039303942565142825, + "loss": 0.79545891, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3023, + "time_per_iteration": 2.7319986820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071706, + "balance_loss_mlp": 1.06121564, + "diversity_loss_mlp": 0.0, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.06941107329713525, + "language_loss": 0.76844412, + "learning_rate": 0.0003927351160383644, + "loss": 0.77916121, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3024, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.05980492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.07084631667240687, + "language_loss": 0.77815473, + "learning_rate": 0.000392430848069222, + "loss": 0.78885376, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3025, + "time_per_iteration": 2.5290136337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06532741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.07224483468752362, + "language_loss": 0.82501459, + "learning_rate": 0.00039212662186212795, + "loss": 0.83576977, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3026, + "time_per_iteration": 2.6017684936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106609, + "balance_loss_mlp": 1.05593956, + "diversity_loss_mlp": 0.0, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.05478704818063415, + "language_loss": 0.77076197, + "learning_rate": 0.0003918224375351934, + "loss": 0.78142285, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3027, + "time_per_iteration": 2.707127571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069708, + "balance_loss_mlp": 1.05940795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.07026049561627037, + "language_loss": 0.78559566, + "learning_rate": 0.0003915182952065135, + "loss": 0.79629278, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 3028, + "time_per_iteration": 2.6728062629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863261, + "balance_loss_mlp": 1.48110199, + "diversity_loss_mlp": 0.21947324, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.028926470462326558, + "language_loss": 0.87632734, + "learning_rate": 0.0003912141949941664, + "loss": 0.88495994, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0129736, + "step": 3029, + "time_per_iteration": 2.7290279865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05748928, + "diversity_loss_mlp": 0.0, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.11092566755711959, + "language_loss": 0.82848042, + "learning_rate": 0.0003909101370162143, + "loss": 0.83916146, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 3030, + "time_per_iteration": 2.5907628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.05161262, + "diversity_loss_mlp": 0.0, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.028764883169419067, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73491609, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.06103516, + "routerloss_mlp": 0.0, + "step": 3031, + "time_per_iteration": 4.87787127494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066859, + "balance_loss_mlp": 1.05651772, + "diversity_loss_mlp": 0.0, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.06710106844205427, + "language_loss": 0.82853395, + "learning_rate": 0.0003903021482356622, + "loss": 0.83920258, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3032, + "time_per_iteration": 2.777536153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05757427, + "diversity_loss_mlp": 0.0, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.05521171326439417, + "language_loss": 0.82775813, + "learning_rate": 0.00038999821766910465, + "loss": 0.83843517, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 3033, + "time_per_iteration": 2.990370035171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.05444503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.06933125597123427, + "language_loss": 0.85725427, + "learning_rate": 0.00038969432980902606, + "loss": 0.86790228, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 3034, + "time_per_iteration": 2.522594690322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101659, + "balance_loss_mlp": 1.01134527, + "diversity_loss_mlp": 0.0, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.016170176694849804, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80801094, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.05249023, + "routerloss_mlp": 0.0, + "step": 3035, + "time_per_iteration": 4.804777383804321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.06007361, + "diversity_loss_mlp": 0.0, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.06630987198212972, + "language_loss": 0.82630336, + "learning_rate": 0.00038908668268020953, + "loss": 0.83700585, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3036, + "time_per_iteration": 2.6598165035247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0547123, + "diversity_loss_mlp": 0.0, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.06353975651870693, + "language_loss": 0.85077345, + "learning_rate": 0.00038878292364738097, + "loss": 0.86142278, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3037, + "time_per_iteration": 2.817431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_mlp": 1.05653155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.06847185322789755, + "language_loss": 0.86992419, + "learning_rate": 0.0003884792077928508, + "loss": 0.88059008, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 3038, + "time_per_iteration": 2.515582323074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05704808, + "diversity_loss_mlp": 0.0, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.08132102193369704, + "language_loss": 0.76704037, + "learning_rate": 0.0003881755352345322, + "loss": 0.77771461, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3039, + "time_per_iteration": 2.506476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.05959702, + "diversity_loss_mlp": 0.0, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05655703451029381, + "language_loss": 0.87182224, + "learning_rate": 0.0003878719060903207, + "loss": 0.88252252, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 3040, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077595, + "balance_loss_mlp": 1.06733704, + "diversity_loss_mlp": 0.0, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.07213898072930079, + "language_loss": 0.83620822, + "learning_rate": 0.0003875683204780961, + "loss": 0.84698415, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3041, + "time_per_iteration": 2.7087528705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00858209, + "balance_loss_mlp": 1.47420132, + "diversity_loss_mlp": 0.21720865, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.0337374590034744, + "language_loss": 0.85750413, + "learning_rate": 0.00038726477851572043, + "loss": 0.86608613, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01250451, + "step": 3042, + "time_per_iteration": 2.8391060829162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085797, + "balance_loss_mlp": 1.07552087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.07424787281712622, + "language_loss": 0.8043561, + "learning_rate": 0.0003869612803210395, + "loss": 0.81521404, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3043, + "time_per_iteration": 2.6728439331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.07525158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.0731909762270397, + "language_loss": 0.83286428, + "learning_rate": 0.0003866578260118817, + "loss": 0.8437193, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 3044, + "time_per_iteration": 2.6332969665527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108221, + "balance_loss_mlp": 1.07239914, + "diversity_loss_mlp": 0.0, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07445534470947208, + "language_loss": 0.82966632, + "learning_rate": 0.0003863544157060581, + "loss": 0.84048843, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3045, + "time_per_iteration": 2.668837785720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081605, + "balance_loss_mlp": 1.07137656, + "diversity_loss_mlp": 0.0, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.07387128485113956, + "language_loss": 0.82359195, + "learning_rate": 0.0003860510495213634, + "loss": 0.83440793, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3046, + "time_per_iteration": 2.8229498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106696, + "balance_loss_mlp": 1.05705416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08160785595799389, + "language_loss": 0.78622752, + "learning_rate": 0.0003857477275755746, + "loss": 0.79689717, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3047, + "time_per_iteration": 2.6294050216674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.0557915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0580402220657833, + "language_loss": 0.83646655, + "learning_rate": 0.00038544444998645167, + "loss": 0.84712666, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3048, + "time_per_iteration": 3.0289785861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059507, + "balance_loss_mlp": 1.04951751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.0674332369398686, + "language_loss": 0.81847656, + "learning_rate": 0.00038514121687173767, + "loss": 0.82907164, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3049, + "time_per_iteration": 2.5797152519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058576, + "balance_loss_mlp": 1.04861593, + "diversity_loss_mlp": 0.0, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.08495884025795868, + "language_loss": 0.82019609, + "learning_rate": 0.00038483802834915807, + "loss": 0.83078188, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3050, + "time_per_iteration": 3.0199241638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_mlp": 1.05154216, + "diversity_loss_mlp": 0.0, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.07816426751212531, + "language_loss": 0.78978479, + "learning_rate": 0.00038453488453642074, + "loss": 0.800403, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3051, + "time_per_iteration": 2.7338953018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_mlp": 1.04610801, + "diversity_loss_mlp": 0.0, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.07385283463746846, + "language_loss": 0.86878967, + "learning_rate": 0.00038423178555121697, + "loss": 0.87935388, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.10308838, + "routerloss_mlp": 0.0, + "step": 3052, + "time_per_iteration": 2.7545297145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_mlp": 1.04783666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.07920619209623277, + "language_loss": 0.85583031, + "learning_rate": 0.00038392873151121994, + "loss": 0.86641347, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 3053, + "time_per_iteration": 3.07143235206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04924083, + "diversity_loss_mlp": 0.0, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07754087781816771, + "language_loss": 0.83137167, + "learning_rate": 0.0003836257225340859, + "loss": 0.84196955, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 3054, + "time_per_iteration": 2.6132304668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066843, + "balance_loss_mlp": 1.05597091, + "diversity_loss_mlp": 0.0, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.0689474058081498, + "language_loss": 0.82020974, + "learning_rate": 0.00038332275873745336, + "loss": 0.83087826, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.10882568, + "routerloss_mlp": 0.0, + "step": 3055, + "time_per_iteration": 3.107823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855378, + "balance_loss_mlp": 1.46855807, + "diversity_loss_mlp": 0.21676093, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.026786885849911755, + "language_loss": 0.82891941, + "learning_rate": 0.0003830198402389431, + "loss": 0.83747321, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01271825, + "step": 3056, + "time_per_iteration": 2.7645249366760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_mlp": 1.03548789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.027829027984012215, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78389645, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3057, + "time_per_iteration": 4.995454549789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082248, + "balance_loss_mlp": 1.07115602, + "diversity_loss_mlp": 0.0, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.10105227922023945, + "language_loss": 0.83302426, + "learning_rate": 0.0003824141396066855, + "loss": 0.8438468, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 3058, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086707, + "balance_loss_mlp": 1.07570362, + "diversity_loss_mlp": 0.0, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.10870959422332387, + "language_loss": 0.8283565, + "learning_rate": 0.000382111357708092, + "loss": 0.83922356, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 3059, + "time_per_iteration": 2.7063958644866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06985879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.09017347087331092, + "language_loss": 0.83373827, + "learning_rate": 0.00038180862157792864, + "loss": 0.84454447, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 3060, + "time_per_iteration": 2.7716259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.06098306, + "diversity_loss_mlp": 0.0, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06780881013643715, + "language_loss": 0.81814772, + "learning_rate": 0.0003815059313337279, + "loss": 0.82886124, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3061, + "time_per_iteration": 2.664134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.06180596, + "diversity_loss_mlp": 0.0, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.06335749004143083, + "language_loss": 0.78063929, + "learning_rate": 0.00038120328709300436, + "loss": 0.79135942, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3062, + "time_per_iteration": 2.8627028465270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.05566847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06769296518732247, + "language_loss": 0.8382163, + "learning_rate": 0.0003809006889732549, + "loss": 0.84887671, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3063, + "time_per_iteration": 2.809983253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.05686879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.07471445768221775, + "language_loss": 0.88052714, + "learning_rate": 0.0003805981370919589, + "loss": 0.89119434, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3064, + "time_per_iteration": 2.526881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_mlp": 1.05822492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.06588713514234819, + "language_loss": 0.83812523, + "learning_rate": 0.0003802956315665771, + "loss": 0.84880579, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3065, + "time_per_iteration": 2.6691834926605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072593, + "balance_loss_mlp": 1.06285346, + "diversity_loss_mlp": 0.0, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.11425397529110681, + "language_loss": 0.8185159, + "learning_rate": 0.0003799931725145529, + "loss": 0.82924175, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3066, + "time_per_iteration": 2.6098556518554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06719375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.07983506473752326, + "language_loss": 0.85902935, + "learning_rate": 0.00037969076005331083, + "loss": 0.86980045, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3067, + "time_per_iteration": 2.7626185417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081851, + "balance_loss_mlp": 1.07184935, + "diversity_loss_mlp": 0.0, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07247659487205776, + "language_loss": 0.8802191, + "learning_rate": 0.00037938839430025817, + "loss": 0.89103758, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3068, + "time_per_iteration": 2.6493396759033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088575, + "balance_loss_mlp": 1.07886577, + "diversity_loss_mlp": 0.0, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.0655302097756617, + "language_loss": 0.85496283, + "learning_rate": 0.0003790860753727835, + "loss": 0.8658486, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3069, + "time_per_iteration": 2.7941815853118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089673, + "balance_loss_mlp": 1.07995713, + "diversity_loss_mlp": 0.0, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0796849495747384, + "language_loss": 0.82864797, + "learning_rate": 0.00037878380338825766, + "loss": 0.83954477, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3070, + "time_per_iteration": 2.6861939430236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102877, + "balance_loss_mlp": 1.09311378, + "diversity_loss_mlp": 0.0, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.08458672700427887, + "language_loss": 0.81556624, + "learning_rate": 0.00037848157846403287, + "loss": 0.82659507, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3071, + "time_per_iteration": 2.873662233352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101959, + "balance_loss_mlp": 1.09236836, + "diversity_loss_mlp": 0.0, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.07248408902015292, + "language_loss": 0.83281767, + "learning_rate": 0.0003781794007174435, + "loss": 0.84383726, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3072, + "time_per_iteration": 2.762472629547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.08360386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.032251872290910595, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75162888, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3073, + "time_per_iteration": 4.854618787765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_mlp": 1.09715033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.058981009489694675, + "language_loss": 0.80947924, + "learning_rate": 0.0003775751872264152, + "loss": 0.8205511, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3074, + "time_per_iteration": 2.771085023880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_mlp": 1.09195375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.056077752757325364, + "language_loss": 0.87175214, + "learning_rate": 0.0003772731517165527, + "loss": 0.88277197, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.10028076, + "routerloss_mlp": 0.0, + "step": 3075, + "time_per_iteration": 2.8292393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103862, + "balance_loss_mlp": 1.09419441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.07602524147414737, + "language_loss": 0.83311272, + "learning_rate": 0.0003769711638534784, + "loss": 0.84415126, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3076, + "time_per_iteration": 2.97261381149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099488, + "balance_loss_mlp": 1.08962953, + "diversity_loss_mlp": 0.0, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07287223806238774, + "language_loss": 0.79046565, + "learning_rate": 0.00037666922375443446, + "loss": 0.8014605, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3077, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093205, + "balance_loss_mlp": 1.08349538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06803693763690793, + "language_loss": 0.81907725, + "learning_rate": 0.00037636733153664396, + "loss": 0.83000934, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3078, + "time_per_iteration": 2.8055219650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109815, + "balance_loss_mlp": 1.08854795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.08595437511710807, + "language_loss": 0.80202127, + "learning_rate": 0.0003760654873173124, + "loss": 0.81300277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3079, + "time_per_iteration": 2.6700353622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089136, + "balance_loss_mlp": 1.07927787, + "diversity_loss_mlp": 0.0, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06826446524438025, + "language_loss": 0.82043588, + "learning_rate": 0.00037576369121362566, + "loss": 0.8313272, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3080, + "time_per_iteration": 2.596071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089019, + "balance_loss_mlp": 1.07946444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.057614109423291045, + "language_loss": 0.81680822, + "learning_rate": 0.0003754619433427516, + "loss": 0.82769841, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3081, + "time_per_iteration": 2.9003093242645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.07771826, + "diversity_loss_mlp": 0.0, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.09118109008842482, + "language_loss": 0.7796042, + "learning_rate": 0.0003751602438218392, + "loss": 0.79047692, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3082, + "time_per_iteration": 2.7739951610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06927121, + "diversity_loss_mlp": 0.0, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.07641398361038237, + "language_loss": 0.84107417, + "learning_rate": 0.0003748585927680186, + "loss": 0.85186076, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3083, + "time_per_iteration": 2.6706809997558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087865, + "balance_loss_mlp": 1.07850111, + "diversity_loss_mlp": 0.0, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.07450452823339063, + "language_loss": 0.82992828, + "learning_rate": 0.00037455699029840086, + "loss": 0.84080696, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3084, + "time_per_iteration": 2.648775100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082396, + "balance_loss_mlp": 1.07310402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.0678124296562273, + "language_loss": 0.84694779, + "learning_rate": 0.0003742554365300787, + "loss": 0.85777175, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3085, + "time_per_iteration": 2.787437677383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00854998, + "balance_loss_mlp": 1.4632709, + "diversity_loss_mlp": 0.21810779, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.030613192067315453, + "language_loss": 0.79049134, + "learning_rate": 0.0003739539315801255, + "loss": 0.79904133, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01430825, + "step": 3086, + "time_per_iteration": 2.9476425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088902, + "balance_loss_mlp": 1.07956231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.08021663243926581, + "language_loss": 0.91758776, + "learning_rate": 0.000373652475565596, + "loss": 0.92847675, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3087, + "time_per_iteration": 2.473820924758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086482, + "balance_loss_mlp": 1.07684994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.0746565513598584, + "language_loss": 0.81288451, + "learning_rate": 0.00037335106860352587, + "loss": 0.8237493, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3088, + "time_per_iteration": 2.6710119247436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.07624292, + "diversity_loss_mlp": 0.0, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.06157127364570171, + "language_loss": 0.82947195, + "learning_rate": 0.00037304971081093146, + "loss": 0.84033072, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3089, + "time_per_iteration": 2.5530550479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095759, + "balance_loss_mlp": 1.0863055, + "diversity_loss_mlp": 0.0, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.06188782031055571, + "language_loss": 0.80896157, + "learning_rate": 0.00037274840230481024, + "loss": 0.81991911, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3090, + "time_per_iteration": 2.707697868347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094547, + "balance_loss_mlp": 1.08488476, + "diversity_loss_mlp": 0.0, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07660649649984981, + "language_loss": 0.79309815, + "learning_rate": 0.00037244714320214077, + "loss": 0.80404359, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3091, + "time_per_iteration": 2.524418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094365, + "balance_loss_mlp": 1.08449435, + "diversity_loss_mlp": 0.0, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.07189913531932149, + "language_loss": 0.83442843, + "learning_rate": 0.000372145933619882, + "loss": 0.84537208, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3092, + "time_per_iteration": 2.889267683029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098289, + "balance_loss_mlp": 1.0883646, + "diversity_loss_mlp": 0.0, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.08404319768947686, + "language_loss": 0.82928061, + "learning_rate": 0.000371844773674974, + "loss": 0.84026349, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3093, + "time_per_iteration": 2.729433059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00849837, + "balance_loss_mlp": 1.45755267, + "diversity_loss_mlp": 0.21677493, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.03215359042810467, + "language_loss": 0.82038867, + "learning_rate": 0.0003715436634843375, + "loss": 0.82888705, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267278, + "step": 3094, + "time_per_iteration": 2.8759658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110065, + "balance_loss_mlp": 1.10049295, + "diversity_loss_mlp": 0.0, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.05868361705811182, + "language_loss": 0.80998492, + "learning_rate": 0.00037124260316487355, + "loss": 0.82108557, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3095, + "time_per_iteration": 2.8515610694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.11049807, + "diversity_loss_mlp": 0.0, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06311708190042467, + "language_loss": 0.89435279, + "learning_rate": 0.0003709415928334643, + "loss": 0.90555483, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3096, + "time_per_iteration": 2.5820794105529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00850727, + "balance_loss_mlp": 1.45894229, + "diversity_loss_mlp": 0.21772251, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.03378868601366531, + "language_loss": 0.80653715, + "learning_rate": 0.00037064063260697233, + "loss": 0.81504446, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01239414, + "step": 3097, + "time_per_iteration": 2.897676467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138893, + "balance_loss_mlp": 1.12893891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06769209825818075, + "language_loss": 0.78597271, + "learning_rate": 0.0003703397226022407, + "loss": 0.79736161, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3098, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_mlp": 1.05123568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.0345928166567928, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76556545, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 3099, + "time_per_iteration": 4.977718114852905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847219, + "balance_loss_mlp": 1.45243645, + "diversity_loss_mlp": 0.21764749, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.029968084230811296, + "language_loss": 0.83180296, + "learning_rate": 0.0003697380537253339, + "loss": 0.84027505, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01217673, + "step": 3100, + "time_per_iteration": 2.673551559448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.11119175, + "diversity_loss_mlp": 0.0, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06630352939366652, + "language_loss": 0.81596649, + "learning_rate": 0.0003694372950867471, + "loss": 0.82717824, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3101, + "time_per_iteration": 2.7776670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119741, + "balance_loss_mlp": 1.1100198, + "diversity_loss_mlp": 0.0, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.07189145573728124, + "language_loss": 0.77408171, + "learning_rate": 0.0003691365871370976, + "loss": 0.78527915, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3102, + "time_per_iteration": 3.04355525970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116521, + "balance_loss_mlp": 1.1067102, + "diversity_loss_mlp": 0.0, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06839859357083694, + "language_loss": 0.8504554, + "learning_rate": 0.00036883592999313093, + "loss": 0.8616206, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3103, + "time_per_iteration": 2.6881608963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.1020087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07720585150601726, + "language_loss": 0.7960434, + "learning_rate": 0.0003685353237715722, + "loss": 0.80715817, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3104, + "time_per_iteration": 2.910879135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_mlp": 1.09433126, + "diversity_loss_mlp": 0.0, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.08349083770410728, + "language_loss": 0.81658864, + "learning_rate": 0.0003682347685891274, + "loss": 0.82762903, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3105, + "time_per_iteration": 2.8556530475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093856, + "balance_loss_mlp": 1.08412814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07861180875636395, + "language_loss": 0.80587226, + "learning_rate": 0.0003679342645624822, + "loss": 0.81681079, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3106, + "time_per_iteration": 2.9788949489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091288, + "balance_loss_mlp": 1.08144689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.062123999367099406, + "language_loss": 0.81345969, + "learning_rate": 0.0003676338118083025, + "loss": 0.82437259, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3107, + "time_per_iteration": 3.0514276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083265, + "balance_loss_mlp": 1.07369304, + "diversity_loss_mlp": 0.0, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.07200241428310707, + "language_loss": 0.79341209, + "learning_rate": 0.0003673334104432347, + "loss": 0.8042447, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3108, + "time_per_iteration": 2.6402766704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_mlp": 1.07493854, + "diversity_loss_mlp": 0.0, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.06431634181531254, + "language_loss": 0.83437502, + "learning_rate": 0.0003670330605839048, + "loss": 0.84521937, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3109, + "time_per_iteration": 2.8350021839141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071839, + "balance_loss_mlp": 1.06252289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08338826074003908, + "language_loss": 0.76629049, + "learning_rate": 0.0003667327623469191, + "loss": 0.77700889, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3110, + "time_per_iteration": 2.7434427738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086579, + "balance_loss_mlp": 1.0770725, + "diversity_loss_mlp": 0.0, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07334566089126898, + "language_loss": 0.7758621, + "learning_rate": 0.00036643251584886333, + "loss": 0.78672791, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3111, + "time_per_iteration": 2.7712619304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080276, + "balance_loss_mlp": 1.07075715, + "diversity_loss_mlp": 0.0, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.0661546294312284, + "language_loss": 0.81729323, + "learning_rate": 0.00036613232120630393, + "loss": 0.82809597, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3112, + "time_per_iteration": 2.6437926292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.06822348, + "diversity_loss_mlp": 0.0, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.09952194732663294, + "language_loss": 0.80305058, + "learning_rate": 0.00036583217853578643, + "loss": 0.81383061, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3113, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085265, + "balance_loss_mlp": 1.07562053, + "diversity_loss_mlp": 0.0, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.09394979208953491, + "language_loss": 0.77671385, + "learning_rate": 0.000365532087953837, + "loss": 0.78756654, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3114, + "time_per_iteration": 3.6197850704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075561, + "balance_loss_mlp": 1.06598282, + "diversity_loss_mlp": 0.0, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.08322265150120763, + "language_loss": 0.89675403, + "learning_rate": 0.00036523204957696065, + "loss": 0.90750962, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3115, + "time_per_iteration": 2.5928850173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05900383, + "diversity_loss_mlp": 0.0, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.07018475264035358, + "language_loss": 0.80565965, + "learning_rate": 0.00036493206352164324, + "loss": 0.81634748, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3116, + "time_per_iteration": 2.9302330017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070118, + "balance_loss_mlp": 1.06046212, + "diversity_loss_mlp": 0.0, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.07338463965566117, + "language_loss": 0.85090643, + "learning_rate": 0.000364632129904349, + "loss": 0.86160767, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3117, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072158, + "balance_loss_mlp": 1.0622344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.06545944211786243, + "language_loss": 0.78013116, + "learning_rate": 0.00036433224884152283, + "loss": 0.79085279, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3118, + "time_per_iteration": 2.714756727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_mlp": 1.06249511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08041065589047977, + "language_loss": 0.77752131, + "learning_rate": 0.00036403242044958875, + "loss": 0.78824466, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3119, + "time_per_iteration": 2.583292245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078089, + "balance_loss_mlp": 1.06846261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.07420053325288596, + "language_loss": 0.91699272, + "learning_rate": 0.0003637326448449507, + "loss": 0.92777365, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3120, + "time_per_iteration": 2.717006206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080309, + "balance_loss_mlp": 1.07065916, + "diversity_loss_mlp": 0.0, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.053625374444117885, + "language_loss": 0.86324787, + "learning_rate": 0.00036343292214399177, + "loss": 0.87405097, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3121, + "time_per_iteration": 2.7628395557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092008, + "balance_loss_mlp": 1.08205438, + "diversity_loss_mlp": 0.0, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08110417303016995, + "language_loss": 0.77154052, + "learning_rate": 0.00036313325246307456, + "loss": 0.78246063, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3122, + "time_per_iteration": 2.7920055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097808, + "balance_loss_mlp": 1.08813453, + "diversity_loss_mlp": 0.0, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07750521229706399, + "language_loss": 0.87508434, + "learning_rate": 0.0003628336359185411, + "loss": 0.88606238, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3123, + "time_per_iteration": 2.6752257347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086195, + "balance_loss_mlp": 1.07632422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09005007447476754, + "language_loss": 0.75524527, + "learning_rate": 0.000362534072626713, + "loss": 0.7661072, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3124, + "time_per_iteration": 2.7923338413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077955, + "balance_loss_mlp": 1.06818557, + "diversity_loss_mlp": 0.0, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.07223530633843779, + "language_loss": 0.81714958, + "learning_rate": 0.00036223456270389093, + "loss": 0.82792914, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3125, + "time_per_iteration": 3.0091912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075718, + "balance_loss_mlp": 1.06540036, + "diversity_loss_mlp": 0.0, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.06403369467156497, + "language_loss": 0.80792087, + "learning_rate": 0.00036193510626635517, + "loss": 0.81867802, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3126, + "time_per_iteration": 2.704378843307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066992, + "balance_loss_mlp": 1.05687714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06193993783441067, + "language_loss": 0.81725299, + "learning_rate": 0.0003616357034303649, + "loss": 0.82792288, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3127, + "time_per_iteration": 3.002530813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_mlp": 1.05243957, + "diversity_loss_mlp": 0.0, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.054941683840542065, + "language_loss": 0.78751493, + "learning_rate": 0.0003613363543121584, + "loss": 0.79813826, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3128, + "time_per_iteration": 2.8690690994262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063837, + "balance_loss_mlp": 1.05367482, + "diversity_loss_mlp": 0.0, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.06760978748019858, + "language_loss": 0.85022873, + "learning_rate": 0.00036103705902795357, + "loss": 0.86086708, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3129, + "time_per_iteration": 2.7233073711395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106265, + "balance_loss_mlp": 1.0526309, + "diversity_loss_mlp": 0.0, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08999540715217709, + "language_loss": 0.79606092, + "learning_rate": 0.0003607378176939471, + "loss": 0.80668741, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3130, + "time_per_iteration": 2.6465327739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060318, + "balance_loss_mlp": 1.0503943, + "diversity_loss_mlp": 0.0, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0812918345139536, + "language_loss": 0.82358718, + "learning_rate": 0.00036043863042631465, + "loss": 0.83419037, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3131, + "time_per_iteration": 2.645275354385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_mlp": 1.05113363, + "diversity_loss_mlp": 0.0, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07968064937120022, + "language_loss": 0.7648955, + "learning_rate": 0.00036013949734121133, + "loss": 0.77550471, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3132, + "time_per_iteration": 3.1564602851867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847858, + "balance_loss_mlp": 1.44895816, + "diversity_loss_mlp": 0.22101411, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.03213509913040014, + "language_loss": 0.82544625, + "learning_rate": 0.00035984041855477043, + "loss": 0.83392477, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01287225, + "step": 3133, + "time_per_iteration": 2.7710041999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00606016, + "balance_loss_mlp": 1.03831875, + "diversity_loss_mlp": 0.14934492, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.0016585081527992916, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79315913, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01218408, + "step": 3134, + "time_per_iteration": 5.010243892669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058814, + "balance_loss_mlp": 1.04887819, + "diversity_loss_mlp": 0.0, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06935738535706247, + "language_loss": 0.79867685, + "learning_rate": 0.00035924242434230637, + "loss": 0.80926502, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3135, + "time_per_iteration": 2.644461154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059705, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08930778928911463, + "language_loss": 0.78960454, + "learning_rate": 0.00035894350914844516, + "loss": 0.80020154, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3136, + "time_per_iteration": 2.6219546794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_mlp": 1.05073738, + "diversity_loss_mlp": 0.0, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.07477991129212373, + "language_loss": 0.82716846, + "learning_rate": 0.0003586446487175703, + "loss": 0.83777732, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3137, + "time_per_iteration": 2.7377843856811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057253, + "balance_loss_mlp": 1.04716182, + "diversity_loss_mlp": 0.0, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.06084036951856249, + "language_loss": 0.85439289, + "learning_rate": 0.0003583458431657099, + "loss": 0.86496538, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3138, + "time_per_iteration": 2.773810863494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056899, + "balance_loss_mlp": 1.04697502, + "diversity_loss_mlp": 0.0, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.10358798927054172, + "language_loss": 0.82887417, + "learning_rate": 0.00035804709260887056, + "loss": 0.83944315, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3139, + "time_per_iteration": 2.7064261436462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0084935, + "balance_loss_mlp": 1.45506001, + "diversity_loss_mlp": 0.21838406, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.02792942393132789, + "language_loss": 0.89382195, + "learning_rate": 0.0003577483971630373, + "loss": 0.9023155, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01262751, + "step": 3140, + "time_per_iteration": 2.747962236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063532, + "balance_loss_mlp": 1.053352, + "diversity_loss_mlp": 0.0, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05833739987767841, + "language_loss": 0.84937215, + "learning_rate": 0.00035744975694417414, + "loss": 0.86000752, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3141, + "time_per_iteration": 2.886625289916992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060532, + "balance_loss_mlp": 1.05025589, + "diversity_loss_mlp": 0.0, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07799366016494108, + "language_loss": 0.82322264, + "learning_rate": 0.00035715117206822344, + "loss": 0.83382797, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3142, + "time_per_iteration": 2.8120434284210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061407, + "balance_loss_mlp": 1.05125666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.06292121779847899, + "language_loss": 0.80965286, + "learning_rate": 0.0003568526426511065, + "loss": 0.82026696, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3143, + "time_per_iteration": 2.600508689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857497, + "balance_loss_mlp": 1.4695704, + "diversity_loss_mlp": 0.22092447, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.033476134745844106, + "language_loss": 0.83131814, + "learning_rate": 0.000356554168808722, + "loss": 0.8398931, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0122495, + "step": 3144, + "time_per_iteration": 3.026810646057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106893, + "balance_loss_mlp": 1.058887, + "diversity_loss_mlp": 0.0, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07082652980877534, + "language_loss": 0.85014772, + "learning_rate": 0.00035625575065694837, + "loss": 0.86083698, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.10040283, + "routerloss_mlp": 0.0, + "step": 3145, + "time_per_iteration": 2.840867519378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845224, + "balance_loss_mlp": 1.44920301, + "diversity_loss_mlp": 0.21683007, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.03030378734616264, + "language_loss": 0.77627134, + "learning_rate": 0.0003559573883116415, + "loss": 0.78472358, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220777, + "step": 3146, + "time_per_iteration": 2.7349908351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107352, + "balance_loss_mlp": 1.06324959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.05605665058846549, + "language_loss": 0.85758018, + "learning_rate": 0.00035565908188863604, + "loss": 0.86831534, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.10272217, + "routerloss_mlp": 0.0, + "step": 3147, + "time_per_iteration": 2.8125319480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845087, + "balance_loss_mlp": 1.44807422, + "diversity_loss_mlp": 0.21802135, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.03003998541469304, + "language_loss": 0.79795343, + "learning_rate": 0.00035536083150374464, + "loss": 0.80640435, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203923, + "step": 3148, + "time_per_iteration": 2.8052470684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.01191068, + "diversity_loss_mlp": 0.0, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.017174605961616223, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75765514, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3149, + "time_per_iteration": 4.839694023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068624, + "balance_loss_mlp": 1.05813408, + "diversity_loss_mlp": 0.0, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.07659984741592324, + "language_loss": 0.86092103, + "learning_rate": 0.0003547644993114475, + "loss": 0.87160718, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 3150, + "time_per_iteration": 2.847841739654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072042, + "balance_loss_mlp": 1.06145024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.11052058943541425, + "language_loss": 0.79770887, + "learning_rate": 0.00035446641773555806, + "loss": 0.80842924, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 3151, + "time_per_iteration": 2.748117208480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068453, + "balance_loss_mlp": 1.05804002, + "diversity_loss_mlp": 0.0, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.06928200582264574, + "language_loss": 0.87033039, + "learning_rate": 0.000354168392660816, + "loss": 0.88101488, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 3152, + "time_per_iteration": 2.7237491607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064757, + "balance_loss_mlp": 1.05449951, + "diversity_loss_mlp": 0.0, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.08776252561897581, + "language_loss": 0.83035654, + "learning_rate": 0.0003538704242029252, + "loss": 0.84100413, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3153, + "time_per_iteration": 2.687469959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064416, + "balance_loss_mlp": 1.05382478, + "diversity_loss_mlp": 0.0, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.06996316305541914, + "language_loss": 0.78274238, + "learning_rate": 0.0003535725124775672, + "loss": 0.79338652, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 3154, + "time_per_iteration": 2.844794750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056628, + "balance_loss_mlp": 1.04631591, + "diversity_loss_mlp": 0.0, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06399916678040601, + "language_loss": 0.86628783, + "learning_rate": 0.00035327465760040126, + "loss": 0.87685412, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3155, + "time_per_iteration": 2.7096383571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_mlp": 1.03957009, + "diversity_loss_mlp": 0.0, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08275092128409181, + "language_loss": 0.84610963, + "learning_rate": 0.00035297685968706526, + "loss": 0.85660648, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3156, + "time_per_iteration": 2.770024061203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_mlp": 1.04370594, + "diversity_loss_mlp": 0.0, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07863496537101755, + "language_loss": 0.83056825, + "learning_rate": 0.00035267911885317454, + "loss": 0.84110844, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3157, + "time_per_iteration": 2.671334743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.04051757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.06000790250856451, + "language_loss": 0.81843442, + "learning_rate": 0.0003523814352143222, + "loss": 0.82894027, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3158, + "time_per_iteration": 2.820080518722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04349208, + "diversity_loss_mlp": 0.0, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.0842902191025903, + "language_loss": 0.91154212, + "learning_rate": 0.00035208380888607937, + "loss": 0.92207724, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3159, + "time_per_iteration": 2.769655466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_mlp": 1.02448559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.01971528727847153, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80491835, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3160, + "time_per_iteration": 4.852057933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020567, + "balance_loss_mlp": 1.015203, + "diversity_loss_mlp": 0.0, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015706814795434412, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76712799, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3161, + "time_per_iteration": 5.034492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105269, + "balance_loss_mlp": 1.04277158, + "diversity_loss_mlp": 0.0, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.07240231538807727, + "language_loss": 0.82060492, + "learning_rate": 0.00035119127492038446, + "loss": 0.83113182, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3162, + "time_per_iteration": 2.7958009243011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058888, + "balance_loss_mlp": 1.04918981, + "diversity_loss_mlp": 0.0, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.08243185287386566, + "language_loss": 0.8267377, + "learning_rate": 0.00035089387898984436, + "loss": 0.83732659, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3163, + "time_per_iteration": 3.0141196250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_mlp": 1.04982388, + "diversity_loss_mlp": 0.0, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07404044041946549, + "language_loss": 0.81452298, + "learning_rate": 0.0003505965409474343, + "loss": 0.82512313, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3164, + "time_per_iteration": 2.884279727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00822199, + "balance_loss_mlp": 1.40056133, + "diversity_loss_mlp": 0.21809974, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.02989314006565827, + "language_loss": 0.86555362, + "learning_rate": 0.0003502992609085913, + "loss": 0.8737756, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01286863, + "step": 3165, + "time_per_iteration": 2.665219306945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064479, + "balance_loss_mlp": 1.05481732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0721176964117247, + "language_loss": 0.82392001, + "learning_rate": 0.00035000203898872954, + "loss": 0.83456486, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3166, + "time_per_iteration": 3.0119569301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064027, + "balance_loss_mlp": 1.05416799, + "diversity_loss_mlp": 0.0, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.07129548452914211, + "language_loss": 0.84480536, + "learning_rate": 0.0003497048753032406, + "loss": 0.85544562, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3167, + "time_per_iteration": 2.854588031768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_mlp": 1.05985689, + "diversity_loss_mlp": 0.0, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.07231997141892146, + "language_loss": 0.80835009, + "learning_rate": 0.000349407769967494, + "loss": 0.8190484, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3168, + "time_per_iteration": 3.3936102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.06240892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.08318926372150726, + "language_loss": 0.8467539, + "learning_rate": 0.0003491107230968361, + "loss": 0.85747683, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.09881592, + "routerloss_mlp": 0.0, + "step": 3169, + "time_per_iteration": 2.618696928024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070277, + "balance_loss_mlp": 1.06021023, + "diversity_loss_mlp": 0.0, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06713277413300113, + "language_loss": 0.81751496, + "learning_rate": 0.00034881373480659085, + "loss": 0.82821774, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3170, + "time_per_iteration": 2.862299919128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063164, + "balance_loss_mlp": 1.05321598, + "diversity_loss_mlp": 0.0, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08200914133790435, + "language_loss": 0.77840459, + "learning_rate": 0.0003485168052120594, + "loss": 0.78903627, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3171, + "time_per_iteration": 2.564657688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060206, + "balance_loss_mlp": 1.05049598, + "diversity_loss_mlp": 0.0, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.07281146068818606, + "language_loss": 0.80045426, + "learning_rate": 0.00034821993442851973, + "loss": 0.81105626, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3172, + "time_per_iteration": 2.6049551963806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058405, + "balance_loss_mlp": 1.04840922, + "diversity_loss_mlp": 0.0, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.08175384117022455, + "language_loss": 0.82176208, + "learning_rate": 0.00034792312257122735, + "loss": 0.83234608, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3173, + "time_per_iteration": 2.6007068157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00813523, + "balance_loss_mlp": 1.38556361, + "diversity_loss_mlp": 0.21673629, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.0335182000566727, + "language_loss": 0.80848879, + "learning_rate": 0.00034762636975541506, + "loss": 0.81662405, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237353, + "step": 3174, + "time_per_iteration": 2.6783013343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061612, + "balance_loss_mlp": 1.05138397, + "diversity_loss_mlp": 0.0, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07909505551334972, + "language_loss": 0.81032109, + "learning_rate": 0.0003473296760962923, + "loss": 0.82093716, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.10229492, + "routerloss_mlp": 0.0, + "step": 3175, + "time_per_iteration": 2.7157249450683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017221, + "balance_loss_mlp": 1.01159382, + "diversity_loss_mlp": 0.0, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.020158265394599716, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79550958, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 3176, + "time_per_iteration": 4.707489728927612 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_mlp": 1.04915345, + "diversity_loss_mlp": 0.0, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.08734600695876651, + "language_loss": 0.8132062, + "learning_rate": 0.00034673646670883976, + "loss": 0.82379746, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.09973145, + "routerloss_mlp": 0.0, + "step": 3177, + "time_per_iteration": 2.965688705444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101108, + "balance_loss_mlp": 1.00557232, + "diversity_loss_mlp": 0.0, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.01801959168057259, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76726103, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.05517578, + "routerloss_mlp": 0.0, + "step": 3178, + "time_per_iteration": 4.958420991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00819092, + "balance_loss_mlp": 1.39532781, + "diversity_loss_mlp": 0.21795917, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.031831362939539476, + "language_loss": 0.81821573, + "learning_rate": 0.0003461434953300865, + "loss": 0.82640672, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01244847, + "step": 3179, + "time_per_iteration": 2.92270827293396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063068, + "balance_loss_mlp": 1.05295873, + "diversity_loss_mlp": 0.0, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.055258394831610054, + "language_loss": 0.81141388, + "learning_rate": 0.0003458470991817515, + "loss": 0.82204449, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3180, + "time_per_iteration": 2.9693758487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060777, + "balance_loss_mlp": 1.05068588, + "diversity_loss_mlp": 0.0, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.06960725666926779, + "language_loss": 0.85075366, + "learning_rate": 0.0003455507628808802, + "loss": 0.86136144, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3181, + "time_per_iteration": 2.6036593914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071608, + "balance_loss_mlp": 1.06117702, + "diversity_loss_mlp": 0.0, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.09091925049493645, + "language_loss": 0.84135175, + "learning_rate": 0.00034525448654252076, + "loss": 0.85206783, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.10430908, + "routerloss_mlp": 0.0, + "step": 3182, + "time_per_iteration": 2.636809825897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061719, + "balance_loss_mlp": 1.05150867, + "diversity_loss_mlp": 0.0, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.07252100888517035, + "language_loss": 0.82806599, + "learning_rate": 0.0003449582702816976, + "loss": 0.83868313, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3183, + "time_per_iteration": 2.707475423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.05986118, + "diversity_loss_mlp": 0.0, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07323153161974344, + "language_loss": 0.82831162, + "learning_rate": 0.0003446621142134122, + "loss": 0.8390131, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3184, + "time_per_iteration": 2.6639719009399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068209, + "balance_loss_mlp": 1.05824375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.08088263565451759, + "language_loss": 0.84134692, + "learning_rate": 0.0003443660184526424, + "loss": 0.85202903, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3185, + "time_per_iteration": 2.465219736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068542, + "balance_loss_mlp": 1.05862343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.06289917121629264, + "language_loss": 0.86502969, + "learning_rate": 0.0003440699831143429, + "loss": 0.87571514, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3186, + "time_per_iteration": 2.7979393005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_mlp": 1.05262065, + "diversity_loss_mlp": 0.0, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.07676649362634465, + "language_loss": 0.82236582, + "learning_rate": 0.0003437740083134449, + "loss": 0.83299029, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3187, + "time_per_iteration": 2.686150312423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066248, + "balance_loss_mlp": 1.0564487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.08991197971935971, + "language_loss": 0.83540225, + "learning_rate": 0.00034347809416485574, + "loss": 0.84606475, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3188, + "time_per_iteration": 2.604308605194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106686, + "balance_loss_mlp": 1.05696571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07330624647380965, + "language_loss": 0.81935883, + "learning_rate": 0.0003431822407834597, + "loss": 0.83002746, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3189, + "time_per_iteration": 2.786008596420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070082, + "balance_loss_mlp": 1.0602051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07745901872485048, + "language_loss": 0.84407461, + "learning_rate": 0.00034288644828411706, + "loss": 0.85477537, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3190, + "time_per_iteration": 3.4646387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078292, + "balance_loss_mlp": 1.06861246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.07529521339256182, + "language_loss": 0.75715351, + "learning_rate": 0.0003425907167816649, + "loss": 0.76793635, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3191, + "time_per_iteration": 2.874946117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808796, + "balance_loss_mlp": 1.37378812, + "diversity_loss_mlp": 0.21839428, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.033870623426287425, + "language_loss": 0.84848714, + "learning_rate": 0.00034229504639091623, + "loss": 0.85657513, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01270431, + "step": 3192, + "time_per_iteration": 2.8179514408111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074782, + "balance_loss_mlp": 1.06519175, + "diversity_loss_mlp": 0.0, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.07980932307836838, + "language_loss": 0.79876941, + "learning_rate": 0.0003419994372266606, + "loss": 0.80951726, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3193, + "time_per_iteration": 3.121509552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_mlp": 1.06069219, + "diversity_loss_mlp": 0.0, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.05544583647367184, + "language_loss": 0.82228541, + "learning_rate": 0.00034170388940366335, + "loss": 0.83298671, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3194, + "time_per_iteration": 2.725961685180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071987, + "balance_loss_mlp": 1.0625093, + "diversity_loss_mlp": 0.0, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.06534437990847952, + "language_loss": 0.80109018, + "learning_rate": 0.0003414084030366667, + "loss": 0.81181002, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3195, + "time_per_iteration": 3.127318859100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073594, + "balance_loss_mlp": 1.06399155, + "diversity_loss_mlp": 0.0, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07171859971508983, + "language_loss": 0.83377409, + "learning_rate": 0.0003411129782403883, + "loss": 0.84451008, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3196, + "time_per_iteration": 2.7145206928253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078425, + "balance_loss_mlp": 1.06870365, + "diversity_loss_mlp": 0.0, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.09666217933122766, + "language_loss": 0.85076511, + "learning_rate": 0.0003408176151295225, + "loss": 0.86154932, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3197, + "time_per_iteration": 2.5919525623321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.06990433, + "diversity_loss_mlp": 0.0, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.06581377475358774, + "language_loss": 0.77279031, + "learning_rate": 0.00034052231381873944, + "loss": 0.78358328, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3198, + "time_per_iteration": 2.597702741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082219, + "balance_loss_mlp": 1.07295024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.0683279233493331, + "language_loss": 0.85131848, + "learning_rate": 0.00034022707442268494, + "loss": 0.8621406, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3199, + "time_per_iteration": 2.562068223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080014, + "balance_loss_mlp": 1.07069743, + "diversity_loss_mlp": 0.0, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.0761762485373057, + "language_loss": 0.82035017, + "learning_rate": 0.0003399318970559813, + "loss": 0.83115035, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3200, + "time_per_iteration": 2.789898157119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_mlp": 1.07100666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.08069642466901547, + "language_loss": 0.84662288, + "learning_rate": 0.00033963678183322656, + "loss": 0.85742772, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3201, + "time_per_iteration": 3.026878595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091206, + "balance_loss_mlp": 1.08173513, + "diversity_loss_mlp": 0.0, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.059556899615455, + "language_loss": 0.82784677, + "learning_rate": 0.0003393417288689945, + "loss": 0.83875883, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3202, + "time_per_iteration": 2.6654982566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090691, + "balance_loss_mlp": 1.08118427, + "diversity_loss_mlp": 0.0, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.07467788423655687, + "language_loss": 0.76113433, + "learning_rate": 0.00033904673827783504, + "loss": 0.77204126, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3203, + "time_per_iteration": 2.92669939994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010864, + "balance_loss_mlp": 1.07689261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06286363142909755, + "language_loss": 0.8181622, + "learning_rate": 0.00033875181017427357, + "loss": 0.82902622, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3204, + "time_per_iteration": 2.5680675506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090188, + "balance_loss_mlp": 1.08068752, + "diversity_loss_mlp": 0.0, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.07085405603281952, + "language_loss": 0.81132901, + "learning_rate": 0.00033845694467281133, + "loss": 0.82223082, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3205, + "time_per_iteration": 2.8592958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806951, + "balance_loss_mlp": 1.37197065, + "diversity_loss_mlp": 0.21751499, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.030824309293312202, + "language_loss": 0.83412218, + "learning_rate": 0.00033816214188792516, + "loss": 0.84219164, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220786, + "step": 3206, + "time_per_iteration": 3.1863744258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087794, + "balance_loss_mlp": 1.07844186, + "diversity_loss_mlp": 0.0, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.07935266980456598, + "language_loss": 0.85488075, + "learning_rate": 0.00033786740193406784, + "loss": 0.86575866, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3207, + "time_per_iteration": 2.626253604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.07682097, + "diversity_loss_mlp": 0.0, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.07540350896316815, + "language_loss": 0.81724775, + "learning_rate": 0.00033757272492566736, + "loss": 0.82811046, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3208, + "time_per_iteration": 2.8899030685424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080715, + "balance_loss_mlp": 1.07114851, + "diversity_loss_mlp": 0.0, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.05796890161537444, + "language_loss": 0.87216032, + "learning_rate": 0.0003372781109771278, + "loss": 0.88296747, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3209, + "time_per_iteration": 2.752558708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077325, + "balance_loss_mlp": 1.06753802, + "diversity_loss_mlp": 0.0, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.06419749590312054, + "language_loss": 0.76373756, + "learning_rate": 0.0003369835602028281, + "loss": 0.7745108, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3210, + "time_per_iteration": 2.7878270149230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_mlp": 1.05842817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.0669620080474601, + "language_loss": 0.79502624, + "learning_rate": 0.0003366890727171232, + "loss": 0.8057074, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3211, + "time_per_iteration": 2.7112903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069089, + "balance_loss_mlp": 1.05950451, + "diversity_loss_mlp": 0.0, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08442057123784988, + "language_loss": 0.78359348, + "learning_rate": 0.00033639464863434313, + "loss": 0.79428434, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3212, + "time_per_iteration": 2.634425163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_mlp": 1.03023958, + "diversity_loss_mlp": 0.0, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.02134222442632316, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478121, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3213, + "time_per_iteration": 4.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066789, + "balance_loss_mlp": 1.05715084, + "diversity_loss_mlp": 0.0, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.07602232380536252, + "language_loss": 0.79711038, + "learning_rate": 0.00033580599113475543, + "loss": 0.80777824, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3214, + "time_per_iteration": 2.987006187438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065135, + "balance_loss_mlp": 1.0553956, + "diversity_loss_mlp": 0.0, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.0762428760353498, + "language_loss": 0.86394417, + "learning_rate": 0.00033551175794648507, + "loss": 0.87459552, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.09735107, + "routerloss_mlp": 0.0, + "step": 3215, + "time_per_iteration": 2.4780433177948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_mlp": 1.05447078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.059308624592263506, + "language_loss": 0.81911212, + "learning_rate": 0.00033521758861821365, + "loss": 0.82975602, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3216, + "time_per_iteration": 2.5746333599090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062859, + "balance_loss_mlp": 1.05332255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.06339313693664829, + "language_loss": 0.89093363, + "learning_rate": 0.0003349234832641479, + "loss": 0.90156221, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3217, + "time_per_iteration": 2.561518669128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.05323243, + "diversity_loss_mlp": 0.0, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07035473810033784, + "language_loss": 0.81230485, + "learning_rate": 0.00033462944199846975, + "loss": 0.82293189, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3218, + "time_per_iteration": 3.0372345447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065156, + "balance_loss_mlp": 1.05549467, + "diversity_loss_mlp": 0.0, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07112802613336307, + "language_loss": 0.86179578, + "learning_rate": 0.00033433546493533606, + "loss": 0.87244731, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3219, + "time_per_iteration": 2.4615468978881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066941, + "balance_loss_mlp": 1.05763078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.07983484825062852, + "language_loss": 0.84651643, + "learning_rate": 0.00033404155218887897, + "loss": 0.8571859, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3220, + "time_per_iteration": 2.725001335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066491, + "balance_loss_mlp": 1.05722845, + "diversity_loss_mlp": 0.0, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.05498489673307501, + "language_loss": 0.87258649, + "learning_rate": 0.00033374770387320534, + "loss": 0.88325131, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3221, + "time_per_iteration": 2.7884719371795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066206, + "balance_loss_mlp": 1.05684233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.06826724081601121, + "language_loss": 0.85091376, + "learning_rate": 0.00033345392010239737, + "loss": 0.86157584, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3222, + "time_per_iteration": 2.758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.06346869, + "diversity_loss_mlp": 0.0, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.07112470494876487, + "language_loss": 0.82199866, + "learning_rate": 0.0003331602009905118, + "loss": 0.8327266, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3223, + "time_per_iteration": 2.7497544288635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073437, + "balance_loss_mlp": 1.06405497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06198906744782324, + "language_loss": 0.8420788, + "learning_rate": 0.00033286654665158085, + "loss": 0.85281318, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3224, + "time_per_iteration": 2.938769817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00805444, + "balance_loss_mlp": 1.36691594, + "diversity_loss_mlp": 0.21943557, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.03128305924884035, + "language_loss": 0.87915754, + "learning_rate": 0.0003325729571996109, + "loss": 0.88721198, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226849, + "step": 3225, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.07079625, + "diversity_loss_mlp": 0.0, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.15310961758991004, + "language_loss": 0.83791566, + "learning_rate": 0.000332279432748584, + "loss": 0.8487193, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3226, + "time_per_iteration": 2.723944664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078837, + "balance_loss_mlp": 1.06965768, + "diversity_loss_mlp": 0.0, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.06102841985942585, + "language_loss": 0.87609762, + "learning_rate": 0.00033198597341245576, + "loss": 0.886886, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3227, + "time_per_iteration": 2.6077282428741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107844, + "balance_loss_mlp": 1.06877792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05859377500804419, + "language_loss": 0.81977952, + "learning_rate": 0.00033169257930515763, + "loss": 0.8305639, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3228, + "time_per_iteration": 3.0201709270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079582, + "balance_loss_mlp": 1.06983042, + "diversity_loss_mlp": 0.0, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06260829937623101, + "language_loss": 0.81892502, + "learning_rate": 0.0003313992505405951, + "loss": 0.82972085, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3229, + "time_per_iteration": 2.7065281867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_mlp": 1.07612467, + "diversity_loss_mlp": 0.0, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.07524693848551285, + "language_loss": 0.81223184, + "learning_rate": 0.0003311059872326487, + "loss": 0.82308924, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3230, + "time_per_iteration": 2.6831164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082096, + "balance_loss_mlp": 1.07257652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.08041283658351392, + "language_loss": 0.792005, + "learning_rate": 0.0003308127894951734, + "loss": 0.80282593, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3231, + "time_per_iteration": 2.6133408546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07829607, + "diversity_loss_mlp": 0.0, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.0806270364015219, + "language_loss": 0.86446661, + "learning_rate": 0.00033051965744199834, + "loss": 0.87534499, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3232, + "time_per_iteration": 2.7565104961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081354, + "balance_loss_mlp": 1.07194829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.06624380464527684, + "language_loss": 0.90293765, + "learning_rate": 0.0003302265911869276, + "loss": 0.91375124, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3233, + "time_per_iteration": 2.926671266555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070794, + "balance_loss_mlp": 1.06132245, + "diversity_loss_mlp": 0.0, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.08213933441923858, + "language_loss": 0.84280741, + "learning_rate": 0.0003299335908437397, + "loss": 0.85351539, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3234, + "time_per_iteration": 2.5910556316375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074187, + "balance_loss_mlp": 1.06473994, + "diversity_loss_mlp": 0.0, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08585428313311574, + "language_loss": 0.79975766, + "learning_rate": 0.0003296406565261873, + "loss": 0.81049955, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3235, + "time_per_iteration": 2.4815149307250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.06017601, + "diversity_loss_mlp": 0.0, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.07182021420774376, + "language_loss": 0.84884858, + "learning_rate": 0.0003293477883479978, + "loss": 0.85954452, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3236, + "time_per_iteration": 2.821707248687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.05992377, + "diversity_loss_mlp": 0.0, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.08520791019751349, + "language_loss": 0.79754794, + "learning_rate": 0.0003290549864228727, + "loss": 0.80824208, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3237, + "time_per_iteration": 2.932542324066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.06604934, + "diversity_loss_mlp": 0.0, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.07053580491728426, + "language_loss": 0.86281902, + "learning_rate": 0.0003287622508644875, + "loss": 0.87357557, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3238, + "time_per_iteration": 2.742324113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00814101, + "balance_loss_mlp": 1.38574493, + "diversity_loss_mlp": 0.21743111, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.03587473659698897, + "language_loss": 0.86128193, + "learning_rate": 0.0003284695817864923, + "loss": 0.86942297, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01251296, + "step": 3239, + "time_per_iteration": 2.5240445137023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.06229532, + "diversity_loss_mlp": 0.0, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.08834225044652763, + "language_loss": 0.84207428, + "learning_rate": 0.0003281769793025116, + "loss": 0.85279179, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3240, + "time_per_iteration": 2.733356237411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00812174, + "balance_loss_mlp": 1.3801111, + "diversity_loss_mlp": 0.21927354, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.03793852776762896, + "language_loss": 0.8948651, + "learning_rate": 0.00032788444352614346, + "loss": 0.90298682, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01248194, + "step": 3241, + "time_per_iteration": 2.599942922592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_mlp": 1.06840372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.07096292336409799, + "language_loss": 0.80582923, + "learning_rate": 0.0003275919745709606, + "loss": 0.81660759, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3242, + "time_per_iteration": 2.5855822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079567, + "balance_loss_mlp": 1.07014906, + "diversity_loss_mlp": 0.0, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.06686828549294242, + "language_loss": 0.81972641, + "learning_rate": 0.00032729957255050936, + "loss": 0.83052206, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3243, + "time_per_iteration": 2.652064561843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.06973052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.0716805986451115, + "language_loss": 0.81674051, + "learning_rate": 0.0003270072375783102, + "loss": 0.8275336, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3244, + "time_per_iteration": 2.894718647003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070218, + "balance_loss_mlp": 1.06071746, + "diversity_loss_mlp": 0.0, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06745739273028781, + "language_loss": 0.79402959, + "learning_rate": 0.00032671496976785774, + "loss": 0.80473179, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3245, + "time_per_iteration": 2.637991428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.06772995, + "diversity_loss_mlp": 0.0, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06297519573167677, + "language_loss": 0.7578575, + "learning_rate": 0.0003264227692326205, + "loss": 0.76862872, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3246, + "time_per_iteration": 3.0627310276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010763, + "balance_loss_mlp": 1.06653643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06711643928809063, + "language_loss": 0.85974544, + "learning_rate": 0.00032613063608604055, + "loss": 0.87050849, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3247, + "time_per_iteration": 2.6602516174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.0650897, + "diversity_loss_mlp": 0.0, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06836828090896512, + "language_loss": 0.8368777, + "learning_rate": 0.0003258385704415343, + "loss": 0.84762454, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3248, + "time_per_iteration": 2.5850605964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068989, + "balance_loss_mlp": 1.05929732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.0567839390219681, + "language_loss": 0.82901073, + "learning_rate": 0.0003255465724124915, + "loss": 0.83970058, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3249, + "time_per_iteration": 2.7133941650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068793, + "balance_loss_mlp": 1.05952442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05839887652934639, + "language_loss": 0.82966471, + "learning_rate": 0.00032525464211227587, + "loss": 0.84035265, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3250, + "time_per_iteration": 2.611469030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071121, + "balance_loss_mlp": 1.06180525, + "diversity_loss_mlp": 0.0, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.07351416510504778, + "language_loss": 0.85770059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8684119, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3251, + "time_per_iteration": 2.6665618419647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066555, + "balance_loss_mlp": 1.05709553, + "diversity_loss_mlp": 0.0, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06415360650327814, + "language_loss": 0.84284747, + "learning_rate": 0.00032467098515164943, + "loss": 0.853513, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3252, + "time_per_iteration": 2.8863329887390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069653, + "balance_loss_mlp": 1.06005657, + "diversity_loss_mlp": 0.0, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07319159145136593, + "language_loss": 0.83726692, + "learning_rate": 0.00032437925871783456, + "loss": 0.84796345, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3253, + "time_per_iteration": 2.6411869525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107529, + "balance_loss_mlp": 1.06570566, + "diversity_loss_mlp": 0.0, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06969705547120199, + "language_loss": 0.84202456, + "learning_rate": 0.00032408760046603803, + "loss": 0.85277742, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3254, + "time_per_iteration": 2.79947829246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070892, + "balance_loss_mlp": 1.06131983, + "diversity_loss_mlp": 0.0, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06622216529123302, + "language_loss": 0.77594912, + "learning_rate": 0.00032379601050949193, + "loss": 0.78665805, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3255, + "time_per_iteration": 3.089614152908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073078, + "balance_loss_mlp": 1.06385732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.06913459813204618, + "language_loss": 0.88098216, + "learning_rate": 0.0003235044889614013, + "loss": 0.8917129, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3256, + "time_per_iteration": 2.5961923599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076462, + "balance_loss_mlp": 1.0670923, + "diversity_loss_mlp": 0.0, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.07985483332339025, + "language_loss": 0.83828497, + "learning_rate": 0.0003232130359349451, + "loss": 0.84904957, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3257, + "time_per_iteration": 2.8164010047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106986, + "balance_loss_mlp": 1.06043053, + "diversity_loss_mlp": 0.0, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06128522405733426, + "language_loss": 0.81820428, + "learning_rate": 0.0003229216515432751, + "loss": 0.82890296, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3258, + "time_per_iteration": 2.7743678092956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00804618, + "balance_loss_mlp": 1.36253858, + "diversity_loss_mlp": 0.22081783, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.03450370763198899, + "language_loss": 0.80067343, + "learning_rate": 0.0003226303358995174, + "loss": 0.80871964, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293936, + "step": 3259, + "time_per_iteration": 2.6309425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065495, + "balance_loss_mlp": 1.05593443, + "diversity_loss_mlp": 0.0, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.05636981182900784, + "language_loss": 0.88916153, + "learning_rate": 0.00032233908911677, + "loss": 0.89981651, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3260, + "time_per_iteration": 2.847928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.06297052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.07940970349438319, + "language_loss": 0.810615, + "learning_rate": 0.0003220479113081053, + "loss": 0.8213383, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3261, + "time_per_iteration": 2.7070260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.06123137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.06801817573689214, + "language_loss": 0.78964686, + "learning_rate": 0.00032175680258656836, + "loss": 0.80034894, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3262, + "time_per_iteration": 2.7481493949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067516, + "balance_loss_mlp": 1.05819941, + "diversity_loss_mlp": 0.0, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06408124041259919, + "language_loss": 0.80091017, + "learning_rate": 0.00032146576306517794, + "loss": 0.81158531, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3263, + "time_per_iteration": 2.799330949783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071623, + "balance_loss_mlp": 1.06242585, + "diversity_loss_mlp": 0.0, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.06510106509747231, + "language_loss": 0.80605328, + "learning_rate": 0.0003211747928569255, + "loss": 0.81676954, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3264, + "time_per_iteration": 2.71992826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.06197381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.06441574996580214, + "language_loss": 0.8154881, + "learning_rate": 0.0003208838920747754, + "loss": 0.82620275, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3265, + "time_per_iteration": 2.8526246547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073261, + "balance_loss_mlp": 1.06409347, + "diversity_loss_mlp": 0.0, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.07893812182761015, + "language_loss": 0.76554495, + "learning_rate": 0.0003205930608316656, + "loss": 0.7762776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3266, + "time_per_iteration": 3.4734575748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066498, + "balance_loss_mlp": 1.05708683, + "diversity_loss_mlp": 0.0, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06620674427686414, + "language_loss": 0.85159075, + "learning_rate": 0.00032030229924050673, + "loss": 0.86225569, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3267, + "time_per_iteration": 2.7024662494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.06285858, + "diversity_loss_mlp": 0.0, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.06417389888600762, + "language_loss": 0.79950488, + "learning_rate": 0.00032001160741418247, + "loss": 0.81022519, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3268, + "time_per_iteration": 2.6112074851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_mlp": 1.05720639, + "diversity_loss_mlp": 0.0, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.08748068388552233, + "language_loss": 0.82228744, + "learning_rate": 0.0003197209854655494, + "loss": 0.83295155, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3269, + "time_per_iteration": 2.642714500427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064334, + "balance_loss_mlp": 1.05507767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07987454353472763, + "language_loss": 0.74589109, + "learning_rate": 0.0003194304335074371, + "loss": 0.7565344, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3270, + "time_per_iteration": 2.8935019969940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061724, + "balance_loss_mlp": 1.05230033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07476368913364388, + "language_loss": 0.8843264, + "learning_rate": 0.0003191399516526475, + "loss": 0.89494365, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3271, + "time_per_iteration": 2.5182955265045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.0580647, + "diversity_loss_mlp": 0.0, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.0671044499872579, + "language_loss": 0.79825693, + "learning_rate": 0.0003188495400139559, + "loss": 0.80893195, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3272, + "time_per_iteration": 2.834392786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106563, + "balance_loss_mlp": 1.05608094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07440991142052084, + "language_loss": 0.84596652, + "learning_rate": 0.00031855919870411013, + "loss": 0.85662282, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3273, + "time_per_iteration": 2.8662502765655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.05781233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06934000715416044, + "language_loss": 0.8508203, + "learning_rate": 0.0003182689278358305, + "loss": 0.86149418, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3274, + "time_per_iteration": 2.707679510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071338, + "balance_loss_mlp": 1.06173623, + "diversity_loss_mlp": 0.0, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.08830765837123684, + "language_loss": 0.79631943, + "learning_rate": 0.0003179787275218105, + "loss": 0.80703276, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3275, + "time_per_iteration": 2.6076841354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806629, + "balance_loss_mlp": 1.3660543, + "diversity_loss_mlp": 0.22307114, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.030809011685951734, + "language_loss": 0.84306061, + "learning_rate": 0.0003176885978747155, + "loss": 0.85112691, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01206683, + "step": 3276, + "time_per_iteration": 2.6712234020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070055, + "balance_loss_mlp": 1.06039953, + "diversity_loss_mlp": 0.0, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.05912857494905308, + "language_loss": 0.82393259, + "learning_rate": 0.0003173985390071839, + "loss": 0.83463317, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3277, + "time_per_iteration": 2.8781204223632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020459, + "balance_loss_mlp": 1.01545238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.014813696367821054, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78920913, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.05004883, + "routerloss_mlp": 0.0, + "step": 3278, + "time_per_iteration": 4.869734287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.06190431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.07813339799532502, + "language_loss": 0.80876654, + "learning_rate": 0.00031681863406122704, + "loss": 0.8194856, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3279, + "time_per_iteration": 2.773547410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074089, + "balance_loss_mlp": 1.06446278, + "diversity_loss_mlp": 0.0, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07216916580711319, + "language_loss": 0.85329819, + "learning_rate": 0.00031652878820794087, + "loss": 0.86403906, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3280, + "time_per_iteration": 2.980884552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070557, + "balance_loss_mlp": 1.0605855, + "diversity_loss_mlp": 0.0, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08329353384521647, + "language_loss": 0.85882401, + "learning_rate": 0.00031623901358449627, + "loss": 0.8695296, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3281, + "time_per_iteration": 2.650691509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107097, + "balance_loss_mlp": 1.06155276, + "diversity_loss_mlp": 0.0, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.06939094759952598, + "language_loss": 0.88689077, + "learning_rate": 0.0003159493103033936, + "loss": 0.89760047, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3282, + "time_per_iteration": 2.589892864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022479, + "balance_loss_mlp": 1.0175674, + "diversity_loss_mlp": 0.0, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.015595592818812096, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941534, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3283, + "time_per_iteration": 4.845726728439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063188, + "balance_loss_mlp": 1.05360401, + "diversity_loss_mlp": 0.0, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.08266858178450832, + "language_loss": 0.82553136, + "learning_rate": 0.0003153701182180776, + "loss": 0.83616328, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3284, + "time_per_iteration": 2.783351421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.05632019, + "diversity_loss_mlp": 0.0, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.063758085961612, + "language_loss": 0.81699741, + "learning_rate": 0.00031508062963872655, + "loss": 0.82765627, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3285, + "time_per_iteration": 2.5591769218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064029, + "balance_loss_mlp": 1.05435503, + "diversity_loss_mlp": 0.0, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06946286940388995, + "language_loss": 0.79716074, + "learning_rate": 0.0003147912128514423, + "loss": 0.80780101, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3286, + "time_per_iteration": 2.7374072074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792206, + "balance_loss_mlp": 1.3388809, + "diversity_loss_mlp": 0.2218435, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.030646294163886513, + "language_loss": 0.87300044, + "learning_rate": 0.0003145018679685859, + "loss": 0.8809225, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01184397, + "step": 3287, + "time_per_iteration": 2.7549750804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067783, + "balance_loss_mlp": 1.05837727, + "diversity_loss_mlp": 0.0, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05105189166461937, + "language_loss": 0.87830782, + "learning_rate": 0.00031421259510249134, + "loss": 0.88898563, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3288, + "time_per_iteration": 2.7835381031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.05796242, + "diversity_loss_mlp": 0.0, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.136960350782239, + "language_loss": 0.81129575, + "learning_rate": 0.00031392339436546414, + "loss": 0.82197285, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3289, + "time_per_iteration": 2.8133864402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.05946374, + "diversity_loss_mlp": 0.0, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0683406709240254, + "language_loss": 0.8385359, + "learning_rate": 0.00031363426586978205, + "loss": 0.84923339, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3290, + "time_per_iteration": 2.7862977981567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070784, + "balance_loss_mlp": 1.06093121, + "diversity_loss_mlp": 0.0, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.06517080869241837, + "language_loss": 0.84541273, + "learning_rate": 0.0003133452097276947, + "loss": 0.85612059, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3291, + "time_per_iteration": 2.735102415084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063814, + "balance_loss_mlp": 1.05341327, + "diversity_loss_mlp": 0.0, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.06655999718782692, + "language_loss": 0.8441304, + "learning_rate": 0.0003130562260514238, + "loss": 0.85476851, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 3292, + "time_per_iteration": 2.7411108016967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.05757022, + "diversity_loss_mlp": 0.0, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.05657366074496326, + "language_loss": 0.81691957, + "learning_rate": 0.0003127673149531626, + "loss": 0.82759798, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3293, + "time_per_iteration": 2.766249418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.05568373, + "diversity_loss_mlp": 0.0, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.0752121645824798, + "language_loss": 0.83436191, + "learning_rate": 0.0003124784765450762, + "loss": 0.84502298, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3294, + "time_per_iteration": 2.5490550994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066062, + "balance_loss_mlp": 1.05569124, + "diversity_loss_mlp": 0.0, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.06917813795445459, + "language_loss": 0.797925, + "learning_rate": 0.0003121897109393017, + "loss": 0.80858564, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3295, + "time_per_iteration": 2.779365062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061765, + "balance_loss_mlp": 1.05135238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06234951999103671, + "language_loss": 0.89289808, + "learning_rate": 0.0003119010182479481, + "loss": 0.9035157, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3296, + "time_per_iteration": 2.6138393878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05855989, + "diversity_loss_mlp": 0.0, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06350246507064496, + "language_loss": 0.82675922, + "learning_rate": 0.00031161239858309563, + "loss": 0.83745015, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 3297, + "time_per_iteration": 2.586970329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072163, + "balance_loss_mlp": 1.06148767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0696399427467901, + "language_loss": 0.83455825, + "learning_rate": 0.0003113238520567964, + "loss": 0.84527981, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 3298, + "time_per_iteration": 2.6586110591888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.05495286, + "diversity_loss_mlp": 0.0, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.07177816314390054, + "language_loss": 0.81584775, + "learning_rate": 0.00031103537878107403, + "loss": 0.82650054, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3299, + "time_per_iteration": 2.708526372909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106763, + "balance_loss_mlp": 1.05756879, + "diversity_loss_mlp": 0.0, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.0821312661024272, + "language_loss": 0.7999661, + "learning_rate": 0.0003107469788679238, + "loss": 0.81064236, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3300, + "time_per_iteration": 2.774571180343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070552, + "balance_loss_mlp": 1.06004977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06269586290013059, + "language_loss": 0.86672354, + "learning_rate": 0.00031045865242931267, + "loss": 0.87742901, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3301, + "time_per_iteration": 2.800271987915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_mlp": 1.06537664, + "diversity_loss_mlp": 0.0, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.060025608417058285, + "language_loss": 0.83086729, + "learning_rate": 0.00031017039957717877, + "loss": 0.84162271, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3302, + "time_per_iteration": 2.99652361869812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083489, + "balance_loss_mlp": 1.07342744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.0673613891994724, + "language_loss": 0.89035141, + "learning_rate": 0.0003098822204234318, + "loss": 0.90118629, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3303, + "time_per_iteration": 2.6769609451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.06632543, + "diversity_loss_mlp": 0.0, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.0682411238472533, + "language_loss": 0.87294948, + "learning_rate": 0.00030959411507995273, + "loss": 0.88371098, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3304, + "time_per_iteration": 3.25303053855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073289, + "balance_loss_mlp": 1.06334674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09293144525754729, + "language_loss": 0.80997777, + "learning_rate": 0.00030930608365859407, + "loss": 0.82071066, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3305, + "time_per_iteration": 2.650047540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.06908488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.06298630616486185, + "language_loss": 0.87762672, + "learning_rate": 0.00030901812627117943, + "loss": 0.8884176, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3306, + "time_per_iteration": 2.605576276779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106604, + "balance_loss_mlp": 1.05617523, + "diversity_loss_mlp": 0.0, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.09439685712352788, + "language_loss": 0.8446157, + "learning_rate": 0.000308730243029504, + "loss": 0.85527611, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3307, + "time_per_iteration": 2.6111857891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070403, + "balance_loss_mlp": 1.06070554, + "diversity_loss_mlp": 0.0, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.06852736886674453, + "language_loss": 0.7914747, + "learning_rate": 0.0003084424340453339, + "loss": 0.80217868, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3308, + "time_per_iteration": 2.8072149753570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063715, + "balance_loss_mlp": 1.05379033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.0739185528440478, + "language_loss": 0.82162523, + "learning_rate": 0.0003081546994304064, + "loss": 0.8322624, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3309, + "time_per_iteration": 2.7670769691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_mlp": 1.04971123, + "diversity_loss_mlp": 0.0, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07802596117693822, + "language_loss": 0.81907165, + "learning_rate": 0.0003078670392964298, + "loss": 0.82966554, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3310, + "time_per_iteration": 2.6474099159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058905, + "balance_loss_mlp": 1.04899311, + "diversity_loss_mlp": 0.0, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.0731557233203608, + "language_loss": 0.82997435, + "learning_rate": 0.00030757945375508406, + "loss": 0.84056342, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3311, + "time_per_iteration": 2.6429851055145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054164, + "balance_loss_mlp": 1.04434729, + "diversity_loss_mlp": 0.0, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06845871409018763, + "language_loss": 0.81414253, + "learning_rate": 0.00030729194291801944, + "loss": 0.8246842, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3312, + "time_per_iteration": 2.6631555557250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.04690671, + "diversity_loss_mlp": 0.0, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.08097298950364754, + "language_loss": 0.77058214, + "learning_rate": 0.00030700450689685787, + "loss": 0.78114825, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3313, + "time_per_iteration": 2.540600061416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.0500232, + "diversity_loss_mlp": 0.0, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.0804877394257798, + "language_loss": 0.85728467, + "learning_rate": 0.00030671714580319186, + "loss": 0.86788076, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3314, + "time_per_iteration": 2.804875135421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055412, + "balance_loss_mlp": 1.04565513, + "diversity_loss_mlp": 0.0, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07597136338877614, + "language_loss": 0.83442312, + "learning_rate": 0.0003064298597485846, + "loss": 0.84497726, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3315, + "time_per_iteration": 2.860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010587, + "balance_loss_mlp": 1.04858494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.06770078099501715, + "language_loss": 0.83771706, + "learning_rate": 0.00030614264884457054, + "loss": 0.84830409, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3316, + "time_per_iteration": 2.6398963928222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_mlp": 1.04450154, + "diversity_loss_mlp": 0.0, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09575765703427323, + "language_loss": 0.77156532, + "learning_rate": 0.000305855513202655, + "loss": 0.78211164, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3317, + "time_per_iteration": 2.57024884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.04220688, + "diversity_loss_mlp": 0.0, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.07693758647747995, + "language_loss": 0.77392501, + "learning_rate": 0.0003055684529343138, + "loss": 0.7844497, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 3318, + "time_per_iteration": 2.4296517372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.04889059, + "diversity_loss_mlp": 0.0, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.08157026730411542, + "language_loss": 0.78901523, + "learning_rate": 0.00030528146815099374, + "loss": 0.79960155, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3319, + "time_per_iteration": 2.6178040504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105942, + "balance_loss_mlp": 1.0495379, + "diversity_loss_mlp": 0.0, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.05929975411068792, + "language_loss": 0.72059178, + "learning_rate": 0.00030499455896411203, + "loss": 0.73118603, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3320, + "time_per_iteration": 2.627962589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_mlp": 1.02049422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.01967957525447477, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77326888, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.0559082, + "routerloss_mlp": 0.0, + "step": 3321, + "time_per_iteration": 4.926000595092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068116, + "balance_loss_mlp": 1.05800068, + "diversity_loss_mlp": 0.0, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06833251339694629, + "language_loss": 0.76524007, + "learning_rate": 0.0003044209678251865, + "loss": 0.77592129, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 3322, + "time_per_iteration": 2.916396379470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.05691469, + "diversity_loss_mlp": 0.0, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.05729140281605497, + "language_loss": 0.84366953, + "learning_rate": 0.0003041342860958306, + "loss": 0.85433549, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3323, + "time_per_iteration": 2.7770862579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.06162453, + "diversity_loss_mlp": 0.0, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.08519156923386062, + "language_loss": 0.91346496, + "learning_rate": 0.00030384768040828857, + "loss": 0.92417842, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3324, + "time_per_iteration": 2.6812171936035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081336, + "balance_loss_mlp": 1.07172787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.07651235317530308, + "language_loss": 0.85160887, + "learning_rate": 0.00030356115087383094, + "loss": 0.86242223, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3325, + "time_per_iteration": 2.6458263397216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00811228, + "balance_loss_mlp": 1.37989581, + "diversity_loss_mlp": 0.21910624, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.034032588306098184, + "language_loss": 0.8530367, + "learning_rate": 0.00030327469760369803, + "loss": 0.86114895, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01172681, + "step": 3326, + "time_per_iteration": 2.6054904460906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075022, + "balance_loss_mlp": 1.06528878, + "diversity_loss_mlp": 0.0, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06651858881657381, + "language_loss": 0.84802389, + "learning_rate": 0.0003029883207091009, + "loss": 0.85877407, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3327, + "time_per_iteration": 2.7084085941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075174, + "balance_loss_mlp": 1.06530905, + "diversity_loss_mlp": 0.0, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.07064025062286232, + "language_loss": 0.78362405, + "learning_rate": 0.00030270202030122095, + "loss": 0.79437578, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3328, + "time_per_iteration": 2.668501615524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_mlp": 1.06659508, + "diversity_loss_mlp": 0.0, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.07541554155703202, + "language_loss": 0.85661519, + "learning_rate": 0.00030241579649121, + "loss": 0.867378, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3329, + "time_per_iteration": 2.9972317218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107413, + "balance_loss_mlp": 1.06488538, + "diversity_loss_mlp": 0.0, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.06439571325368963, + "language_loss": 0.7957617, + "learning_rate": 0.00030212964939018994, + "loss": 0.806503, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3330, + "time_per_iteration": 2.5598840713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075671, + "balance_loss_mlp": 1.06651545, + "diversity_loss_mlp": 0.0, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.07958558119065547, + "language_loss": 0.85401917, + "learning_rate": 0.0003018435791092527, + "loss": 0.8647759, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3331, + "time_per_iteration": 2.4886720180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077198, + "balance_loss_mlp": 1.06757176, + "diversity_loss_mlp": 0.0, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08502928683846613, + "language_loss": 0.80926251, + "learning_rate": 0.00030155758575946083, + "loss": 0.8200345, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3332, + "time_per_iteration": 2.661039113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073982, + "balance_loss_mlp": 1.06464815, + "diversity_loss_mlp": 0.0, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.07641451366860309, + "language_loss": 0.84045428, + "learning_rate": 0.0003012716694518467, + "loss": 0.85119408, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3333, + "time_per_iteration": 2.579451322555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.06456161, + "diversity_loss_mlp": 0.0, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.06148329614598223, + "language_loss": 0.85011578, + "learning_rate": 0.000300985830297413, + "loss": 0.86085725, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3334, + "time_per_iteration": 2.6951658725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070237, + "balance_loss_mlp": 1.0607183, + "diversity_loss_mlp": 0.0, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.07715385519242493, + "language_loss": 0.8765533, + "learning_rate": 0.00030070006840713205, + "loss": 0.88725567, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3335, + "time_per_iteration": 3.415095329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_mlp": 1.05956614, + "diversity_loss_mlp": 0.0, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.06540243812784874, + "language_loss": 0.73462147, + "learning_rate": 0.000300414383891947, + "loss": 0.74531144, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3336, + "time_per_iteration": 2.8207781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06142569, + "diversity_loss_mlp": 0.0, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.062126831222401244, + "language_loss": 0.88856506, + "learning_rate": 0.00030012877686276973, + "loss": 0.89927197, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3337, + "time_per_iteration": 2.701467752456665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.06103206, + "diversity_loss_mlp": 0.0, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.06622404014204096, + "language_loss": 0.86998606, + "learning_rate": 0.0002998432474304832, + "loss": 0.88069206, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3338, + "time_per_iteration": 2.754462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023208, + "balance_loss_mlp": 1.01724732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.025409804512754288, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80260551, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3339, + "time_per_iteration": 4.871408700942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061344, + "balance_loss_mlp": 1.05190849, + "diversity_loss_mlp": 0.0, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.056182904751461135, + "language_loss": 0.88884711, + "learning_rate": 0.00029927242179996107, + "loss": 0.89946061, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3340, + "time_per_iteration": 2.6943204402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063875, + "balance_loss_mlp": 1.05451107, + "diversity_loss_mlp": 0.0, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05740093819519034, + "language_loss": 0.83547878, + "learning_rate": 0.0002989871258233398, + "loss": 0.8461175, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3341, + "time_per_iteration": 2.759075164794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106288, + "balance_loss_mlp": 1.05317652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.08495529058707293, + "language_loss": 0.82866132, + "learning_rate": 0.0002987019078868373, + "loss": 0.83929014, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3342, + "time_per_iteration": 2.460184097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806137, + "balance_loss_mlp": 1.3687458, + "diversity_loss_mlp": 0.21894245, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.03059825895364693, + "language_loss": 0.81932986, + "learning_rate": 0.00029841676810118484, + "loss": 0.82739115, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01229309, + "step": 3343, + "time_per_iteration": 2.6885409355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_mlp": 1.04915428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.0604476685897385, + "language_loss": 0.87177467, + "learning_rate": 0.0002981317065770839, + "loss": 0.88236231, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3344, + "time_per_iteration": 3.03983736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060176, + "balance_loss_mlp": 1.05044222, + "diversity_loss_mlp": 0.0, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.07704872008291591, + "language_loss": 0.8078779, + "learning_rate": 0.00029784672342520493, + "loss": 0.81847966, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3345, + "time_per_iteration": 2.6846296787261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_mlp": 1.05220366, + "diversity_loss_mlp": 0.0, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06975007259690363, + "language_loss": 0.8341136, + "learning_rate": 0.00029756181875618834, + "loss": 0.84473026, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3346, + "time_per_iteration": 2.5665693283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808422, + "balance_loss_mlp": 1.37269104, + "diversity_loss_mlp": 0.21939373, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.035494504018204545, + "language_loss": 0.83294541, + "learning_rate": 0.0002972769926806439, + "loss": 0.84102958, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123796, + "step": 3347, + "time_per_iteration": 2.504934549331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0080263, + "balance_loss_mlp": 1.36098909, + "diversity_loss_mlp": 0.21952364, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0334865497392214, + "language_loss": 0.88848293, + "learning_rate": 0.0002969922453091508, + "loss": 0.89650929, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237371, + "step": 3348, + "time_per_iteration": 2.588092803955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105701, + "balance_loss_mlp": 1.04741955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.07081599083542611, + "language_loss": 0.85229504, + "learning_rate": 0.00029670757675225777, + "loss": 0.86286509, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3349, + "time_per_iteration": 2.7467896938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_mlp": 1.04726744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.08621507866757971, + "language_loss": 0.79660463, + "learning_rate": 0.0002964229871204831, + "loss": 0.80717242, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3350, + "time_per_iteration": 2.65602707862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056473, + "balance_loss_mlp": 1.04715693, + "diversity_loss_mlp": 0.0, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.0705050991392221, + "language_loss": 0.83769023, + "learning_rate": 0.00029613847652431403, + "loss": 0.84825498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3351, + "time_per_iteration": 2.8451104164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797485, + "balance_loss_mlp": 1.35163832, + "diversity_loss_mlp": 0.21852379, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.02943697991412704, + "language_loss": 0.79510611, + "learning_rate": 0.0002958540450742078, + "loss": 0.80308104, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01240353, + "step": 3352, + "time_per_iteration": 2.950679063796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060228, + "balance_loss_mlp": 1.05063784, + "diversity_loss_mlp": 0.0, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.06852868488451136, + "language_loss": 0.7732749, + "learning_rate": 0.0002955696928805901, + "loss": 0.78387713, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3353, + "time_per_iteration": 2.8771724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067774, + "balance_loss_mlp": 1.0582372, + "diversity_loss_mlp": 0.0, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.10704512558750189, + "language_loss": 0.86111909, + "learning_rate": 0.0002952854200538563, + "loss": 0.87179685, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3354, + "time_per_iteration": 2.777782917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798015, + "balance_loss_mlp": 1.35377836, + "diversity_loss_mlp": 0.21820019, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.032699702246912744, + "language_loss": 0.82167614, + "learning_rate": 0.000295001226704371, + "loss": 0.82965624, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01202584, + "step": 3355, + "time_per_iteration": 2.5991604328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_mlp": 1.05207551, + "diversity_loss_mlp": 0.0, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07645377110954561, + "language_loss": 0.82891458, + "learning_rate": 0.00029471711294246783, + "loss": 0.8395294, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3356, + "time_per_iteration": 2.8146939277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.05512571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.07650305014050414, + "language_loss": 0.82254899, + "learning_rate": 0.0002944330788784494, + "loss": 0.83319402, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3357, + "time_per_iteration": 2.90537428855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05508041, + "diversity_loss_mlp": 0.0, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.06168723315149378, + "language_loss": 0.84662282, + "learning_rate": 0.00029414912462258786, + "loss": 0.85727078, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3358, + "time_per_iteration": 2.8301830291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068597, + "balance_loss_mlp": 1.05873299, + "diversity_loss_mlp": 0.0, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.07109215771884392, + "language_loss": 0.81651056, + "learning_rate": 0.00029386525028512366, + "loss": 0.8271966, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3359, + "time_per_iteration": 2.689298152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068602, + "balance_loss_mlp": 1.05881464, + "diversity_loss_mlp": 0.0, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.0690455154627963, + "language_loss": 0.86761546, + "learning_rate": 0.0002935814559762666, + "loss": 0.8783015, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3360, + "time_per_iteration": 2.820415496826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.06286263, + "diversity_loss_mlp": 0.0, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.06340694058104589, + "language_loss": 0.7940557, + "learning_rate": 0.0002932977418061957, + "loss": 0.80478007, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3361, + "time_per_iteration": 2.638246536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075754, + "balance_loss_mlp": 1.06592488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.11078731162526398, + "language_loss": 0.80980253, + "learning_rate": 0.00029301410788505833, + "loss": 0.82056004, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3362, + "time_per_iteration": 2.829946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067795, + "balance_loss_mlp": 1.05792451, + "diversity_loss_mlp": 0.0, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.08350394703111745, + "language_loss": 0.80845594, + "learning_rate": 0.00029273055432297126, + "loss": 0.81913394, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3363, + "time_per_iteration": 2.5047130584716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057084, + "balance_loss_mlp": 1.04717803, + "diversity_loss_mlp": 0.0, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06756647759690963, + "language_loss": 0.80998582, + "learning_rate": 0.00029244708123001917, + "loss": 0.8205567, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3364, + "time_per_iteration": 3.071207284927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059298, + "balance_loss_mlp": 1.04951715, + "diversity_loss_mlp": 0.0, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.08982319043529345, + "language_loss": 0.84555328, + "learning_rate": 0.0002921636887162565, + "loss": 0.85614622, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3365, + "time_per_iteration": 2.768284797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057421, + "balance_loss_mlp": 1.04800391, + "diversity_loss_mlp": 0.0, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.08629567448100454, + "language_loss": 0.83712798, + "learning_rate": 0.00029188037689170595, + "loss": 0.84770226, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3366, + "time_per_iteration": 2.9462075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04440713, + "diversity_loss_mlp": 0.0, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.07194825267456643, + "language_loss": 0.84329098, + "learning_rate": 0.0002915971458663586, + "loss": 0.85383338, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.09820557, + "routerloss_mlp": 0.0, + "step": 3367, + "time_per_iteration": 3.052452802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105506, + "balance_loss_mlp": 1.04521894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.06187590041276245, + "language_loss": 0.81901962, + "learning_rate": 0.00029131399575017494, + "loss": 0.82957023, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3368, + "time_per_iteration": 3.260995864868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054713, + "balance_loss_mlp": 1.04508734, + "diversity_loss_mlp": 0.0, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.08987768190651603, + "language_loss": 0.85898274, + "learning_rate": 0.0002910309266530836, + "loss": 0.8695299, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3369, + "time_per_iteration": 2.8022115230560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059559, + "balance_loss_mlp": 1.0497539, + "diversity_loss_mlp": 0.0, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.07644364345836648, + "language_loss": 0.8560974, + "learning_rate": 0.0002907479386849814, + "loss": 0.86669296, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.09796143, + "routerloss_mlp": 0.0, + "step": 3370, + "time_per_iteration": 2.646334171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057441, + "balance_loss_mlp": 1.04791021, + "diversity_loss_mlp": 0.0, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.07833648604751785, + "language_loss": 0.80597669, + "learning_rate": 0.0002904650319557339, + "loss": 0.81655109, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3371, + "time_per_iteration": 2.9977073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787303, + "balance_loss_mlp": 1.33170056, + "diversity_loss_mlp": 0.21746175, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.036264020076934224, + "language_loss": 0.81342006, + "learning_rate": 0.0002901822065751758, + "loss": 0.82129312, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01272238, + "step": 3372, + "time_per_iteration": 2.697375774383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054945, + "balance_loss_mlp": 1.04537833, + "diversity_loss_mlp": 0.0, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.06787352107623057, + "language_loss": 0.8556366, + "learning_rate": 0.0002898994626531093, + "loss": 0.86618596, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3373, + "time_per_iteration": 2.8561713695526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_mlp": 1.05008769, + "diversity_loss_mlp": 0.0, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.07079984620053167, + "language_loss": 0.87879932, + "learning_rate": 0.00028961680029930526, + "loss": 0.88939387, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3374, + "time_per_iteration": 2.535357713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_mlp": 1.04902411, + "diversity_loss_mlp": 0.0, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.07847742657670442, + "language_loss": 0.7705428, + "learning_rate": 0.00028933421962350317, + "loss": 0.78112632, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3375, + "time_per_iteration": 2.7630350589752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059653, + "balance_loss_mlp": 1.05022955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.060066877370730534, + "language_loss": 0.83867884, + "learning_rate": 0.0002890517207354104, + "loss": 0.84927535, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3376, + "time_per_iteration": 2.8403854370117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067954, + "balance_loss_mlp": 1.05819058, + "diversity_loss_mlp": 0.0, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.07875615832785021, + "language_loss": 0.81685328, + "learning_rate": 0.0002887693037447029, + "loss": 0.82753289, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3377, + "time_per_iteration": 2.5936834812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786778, + "balance_loss_mlp": 1.32879448, + "diversity_loss_mlp": 0.22056285, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.03360133181749734, + "language_loss": 0.82620949, + "learning_rate": 0.00028848696876102443, + "loss": 0.8340773, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01209909, + "step": 3378, + "time_per_iteration": 2.646881341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083646, + "balance_loss_mlp": 1.07432425, + "diversity_loss_mlp": 0.0, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.07289026043627096, + "language_loss": 0.83464664, + "learning_rate": 0.00028820471589398723, + "loss": 0.84548312, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3379, + "time_per_iteration": 2.5300872325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079061, + "balance_loss_mlp": 1.3374207, + "diversity_loss_mlp": 0.22020277, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.03832598047329158, + "language_loss": 0.78047603, + "learning_rate": 0.00028792254525317196, + "loss": 0.78838205, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179803, + "step": 3380, + "time_per_iteration": 2.696711301803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_mlp": 1.08066666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07654044550208572, + "language_loss": 0.81385279, + "learning_rate": 0.00028764045694812645, + "loss": 0.82475317, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3381, + "time_per_iteration": 2.7730586528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.08303761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.08987457099582341, + "language_loss": 0.76744068, + "learning_rate": 0.0002873584510883671, + "loss": 0.77836508, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3382, + "time_per_iteration": 2.6443450450897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088701, + "balance_loss_mlp": 1.07926512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.07067062397279458, + "language_loss": 0.86143303, + "learning_rate": 0.0002870765277833788, + "loss": 0.87232006, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3383, + "time_per_iteration": 2.740920305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.07161593, + "diversity_loss_mlp": 0.0, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.07689735458190097, + "language_loss": 0.80460048, + "learning_rate": 0.00028679468714261347, + "loss": 0.81540942, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3384, + "time_per_iteration": 2.7767040729522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.06546891, + "diversity_loss_mlp": 0.0, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06416640561224615, + "language_loss": 0.76925558, + "learning_rate": 0.0002865129292754918, + "loss": 0.78000295, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3385, + "time_per_iteration": 2.591616630554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075196, + "balance_loss_mlp": 1.06574309, + "diversity_loss_mlp": 0.0, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.06819374320087251, + "language_loss": 0.81950033, + "learning_rate": 0.00028623125429140105, + "loss": 0.83025235, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3386, + "time_per_iteration": 2.819565773010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.05845094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.07152430707450508, + "language_loss": 0.8685019, + "learning_rate": 0.00028594966229969785, + "loss": 0.87918359, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3387, + "time_per_iteration": 2.6802561283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_mlp": 1.05746567, + "diversity_loss_mlp": 0.0, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.0719578704836234, + "language_loss": 0.81695348, + "learning_rate": 0.00028566815340970577, + "loss": 0.82762671, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3388, + "time_per_iteration": 2.725184917449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078869, + "balance_loss_mlp": 1.33117235, + "diversity_loss_mlp": 0.22285563, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.03133119374313574, + "language_loss": 0.80959165, + "learning_rate": 0.0002853867277307162, + "loss": 0.81747854, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01167633, + "step": 3389, + "time_per_iteration": 2.6700825691223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.05601168, + "diversity_loss_mlp": 0.0, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.077177119922592, + "language_loss": 0.82811326, + "learning_rate": 0.00028510538537198824, + "loss": 0.83877325, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 3390, + "time_per_iteration": 2.65598464012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065546, + "balance_loss_mlp": 1.05591428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.06292665593790116, + "language_loss": 0.86663938, + "learning_rate": 0.00028482412644274867, + "loss": 0.87729478, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3391, + "time_per_iteration": 2.926029682159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106361, + "balance_loss_mlp": 1.05354261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.07441000419261597, + "language_loss": 0.74793214, + "learning_rate": 0.00028454295105219207, + "loss": 0.75856817, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3392, + "time_per_iteration": 2.6511483192443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064701, + "balance_loss_mlp": 1.05479479, + "diversity_loss_mlp": 0.0, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.053639196798002685, + "language_loss": 0.79547405, + "learning_rate": 0.0002842618593094802, + "loss": 0.80612105, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3393, + "time_per_iteration": 3.1180903911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066549, + "balance_loss_mlp": 1.05651164, + "diversity_loss_mlp": 0.0, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.09762000223606793, + "language_loss": 0.80486917, + "learning_rate": 0.00028398085132374243, + "loss": 0.81553459, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3394, + "time_per_iteration": 2.805560350418091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_mlp": 1.05185044, + "diversity_loss_mlp": 0.0, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.06212778963151281, + "language_loss": 0.84015262, + "learning_rate": 0.0002836999272040761, + "loss": 0.85077065, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3395, + "time_per_iteration": 3.1151998043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_mlp": 1.05245829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07524661860640132, + "language_loss": 0.83834863, + "learning_rate": 0.00028341908705954575, + "loss": 0.84897625, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3396, + "time_per_iteration": 2.5500996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00599946, + "balance_loss_mlp": 1.02570343, + "diversity_loss_mlp": 0.15256089, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.0014313680900061394, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82361758, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01081435, + "step": 3397, + "time_per_iteration": 4.838392496109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060865, + "balance_loss_mlp": 1.05047619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.08700190278237876, + "language_loss": 0.77911532, + "learning_rate": 0.00028285765913198604, + "loss": 0.78972399, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 3398, + "time_per_iteration": 2.5510177612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_mlp": 1.04590559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.06794032810044964, + "language_loss": 0.82229477, + "learning_rate": 0.0002825770715669227, + "loss": 0.83285522, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3399, + "time_per_iteration": 2.7065982818603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.04248071, + "diversity_loss_mlp": 0.0, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06703848890261048, + "language_loss": 0.81440985, + "learning_rate": 0.00028229656841292634, + "loss": 0.82493651, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3400, + "time_per_iteration": 2.7117483615875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050894, + "balance_loss_mlp": 1.04067171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06998039744710104, + "language_loss": 0.76892245, + "learning_rate": 0.0002820161497788979, + "loss": 0.7794314, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3401, + "time_per_iteration": 2.590047836303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049365, + "balance_loss_mlp": 1.03930926, + "diversity_loss_mlp": 0.0, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.06845614791056948, + "language_loss": 0.86992002, + "learning_rate": 0.00028173581577370545, + "loss": 0.88041365, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.1005249, + "routerloss_mlp": 0.0, + "step": 3402, + "time_per_iteration": 2.7577242851257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047368, + "balance_loss_mlp": 1.03716338, + "diversity_loss_mlp": 0.0, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.059228402052172, + "language_loss": 0.78973734, + "learning_rate": 0.0002814555665061844, + "loss": 0.80021101, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 3403, + "time_per_iteration": 2.731137752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_mlp": 1.0375247, + "diversity_loss_mlp": 0.0, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.07926071177251158, + "language_loss": 0.77611935, + "learning_rate": 0.00028117540208513715, + "loss": 0.78659368, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3404, + "time_per_iteration": 2.689107894897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077145, + "balance_loss_mlp": 1.2970531, + "diversity_loss_mlp": 0.22200939, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.029568297533915613, + "language_loss": 0.85005927, + "learning_rate": 0.00028089532261933313, + "loss": 0.85777372, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01191924, + "step": 3405, + "time_per_iteration": 2.7177927494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105146, + "balance_loss_mlp": 1.04141116, + "diversity_loss_mlp": 0.0, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.08876519929545809, + "language_loss": 0.85989165, + "learning_rate": 0.0002806153282175087, + "loss": 0.87040627, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3406, + "time_per_iteration": 2.5502045154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04348814, + "diversity_loss_mlp": 0.0, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.07350490516448754, + "language_loss": 0.82776654, + "learning_rate": 0.0002803354189883679, + "loss": 0.83829957, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3407, + "time_per_iteration": 2.8476340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_mlp": 1.0448494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.06617021222220203, + "language_loss": 0.85199594, + "learning_rate": 0.00028005559504058053, + "loss": 0.86254454, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3408, + "time_per_iteration": 2.701035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_mlp": 1.04206932, + "diversity_loss_mlp": 0.0, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.08388731304351217, + "language_loss": 0.77208018, + "learning_rate": 0.0002797758564827838, + "loss": 0.78260207, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3409, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058496, + "balance_loss_mlp": 1.04903674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.07006819638769121, + "language_loss": 0.83542061, + "learning_rate": 0.0002794962034235824, + "loss": 0.84600556, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3410, + "time_per_iteration": 2.634612798690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054024, + "balance_loss_mlp": 1.04401076, + "diversity_loss_mlp": 0.0, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.07454971523093613, + "language_loss": 0.74929279, + "learning_rate": 0.00027921663597154695, + "loss": 0.75983304, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3411, + "time_per_iteration": 2.736161708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058952, + "balance_loss_mlp": 1.04926038, + "diversity_loss_mlp": 0.0, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.08159088858174726, + "language_loss": 0.81125355, + "learning_rate": 0.00027893715423521525, + "loss": 0.82184303, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3412, + "time_per_iteration": 2.452563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781164, + "balance_loss_mlp": 1.31892097, + "diversity_loss_mlp": 0.22038518, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.03347946196666781, + "language_loss": 0.8419345, + "learning_rate": 0.00027865775832309163, + "loss": 0.84974611, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151081, + "step": 3413, + "time_per_iteration": 2.6782755851745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068715, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.0675198993979362, + "language_loss": 0.86263126, + "learning_rate": 0.00027837844834364733, + "loss": 0.87331843, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3414, + "time_per_iteration": 2.63967227935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_mlp": 1.04836726, + "diversity_loss_mlp": 0.0, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.06663266607359189, + "language_loss": 0.8659035, + "learning_rate": 0.00027809922440532, + "loss": 0.87648469, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3415, + "time_per_iteration": 2.816204786300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059729, + "balance_loss_mlp": 1.05018628, + "diversity_loss_mlp": 0.0, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06360594790571725, + "language_loss": 0.81154943, + "learning_rate": 0.00027782008661651406, + "loss": 0.82214665, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3416, + "time_per_iteration": 2.80657958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059234, + "balance_loss_mlp": 1.04937577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.062003807204006764, + "language_loss": 0.87255514, + "learning_rate": 0.00027754103508560013, + "loss": 0.88314748, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3417, + "time_per_iteration": 2.648777723312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062014, + "balance_loss_mlp": 1.05205965, + "diversity_loss_mlp": 0.0, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.06781110485333444, + "language_loss": 0.82382166, + "learning_rate": 0.0002772620699209163, + "loss": 0.83444178, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3418, + "time_per_iteration": 2.566547155380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_mlp": 1.0503962, + "diversity_loss_mlp": 0.0, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.0650517875970755, + "language_loss": 0.79616904, + "learning_rate": 0.0002769831912307658, + "loss": 0.80676609, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3419, + "time_per_iteration": 2.606062889099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061383, + "balance_loss_mlp": 1.05156565, + "diversity_loss_mlp": 0.0, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.07306581186555239, + "language_loss": 0.80279779, + "learning_rate": 0.00027670439912341917, + "loss": 0.81341165, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3420, + "time_per_iteration": 2.616004228591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058734, + "balance_loss_mlp": 1.04903078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.07531365664549339, + "language_loss": 0.83319843, + "learning_rate": 0.0002764256937071129, + "loss": 0.84378576, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3421, + "time_per_iteration": 2.7864840030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061647, + "balance_loss_mlp": 1.05205703, + "diversity_loss_mlp": 0.0, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.06844647739450752, + "language_loss": 0.87222612, + "learning_rate": 0.00027614707509005036, + "loss": 0.88284254, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3422, + "time_per_iteration": 2.666473388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058786, + "balance_loss_mlp": 1.04912376, + "diversity_loss_mlp": 0.0, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.0762783210263198, + "language_loss": 0.79373097, + "learning_rate": 0.0002758685433804008, + "loss": 0.8043189, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3423, + "time_per_iteration": 2.4872303009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056028, + "balance_loss_mlp": 1.04637778, + "diversity_loss_mlp": 0.0, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.07259832833327884, + "language_loss": 0.79187661, + "learning_rate": 0.00027559009868630005, + "loss": 0.80243689, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3424, + "time_per_iteration": 3.1284892559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063823, + "balance_loss_mlp": 1.0545187, + "diversity_loss_mlp": 0.0, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.07475259244153008, + "language_loss": 0.80332637, + "learning_rate": 0.0002753117411158491, + "loss": 0.81396455, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3425, + "time_per_iteration": 3.024216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066892, + "balance_loss_mlp": 1.05724216, + "diversity_loss_mlp": 0.0, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.06493586108743211, + "language_loss": 0.89989424, + "learning_rate": 0.0002750334707771168, + "loss": 0.91056317, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3426, + "time_per_iteration": 2.6436870098114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066532, + "balance_loss_mlp": 1.0567987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06891806065084582, + "language_loss": 0.81568319, + "learning_rate": 0.0002747552877781369, + "loss": 0.82634848, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3427, + "time_per_iteration": 2.484457015991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106612, + "balance_loss_mlp": 1.05665517, + "diversity_loss_mlp": 0.0, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.06651025164376474, + "language_loss": 0.81769067, + "learning_rate": 0.0002744771922269097, + "loss": 0.82835186, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3428, + "time_per_iteration": 2.724034547805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06395817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.08249136451092651, + "language_loss": 0.81983304, + "learning_rate": 0.0002741991842314015, + "loss": 0.83056509, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3429, + "time_per_iteration": 3.4791431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106958, + "balance_loss_mlp": 1.06021035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.09631718735244636, + "language_loss": 0.85994452, + "learning_rate": 0.0002739212638995445, + "loss": 0.87064034, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3430, + "time_per_iteration": 2.5809226036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.06089258, + "diversity_loss_mlp": 0.0, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.07152811859744175, + "language_loss": 0.83226836, + "learning_rate": 0.00027364343133923696, + "loss": 0.84297395, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3431, + "time_per_iteration": 2.664724826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072179, + "balance_loss_mlp": 1.06281483, + "diversity_loss_mlp": 0.0, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.07076815482363777, + "language_loss": 0.82710063, + "learning_rate": 0.0002733656866583431, + "loss": 0.83782238, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3432, + "time_per_iteration": 2.6845815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075681, + "balance_loss_mlp": 1.06614459, + "diversity_loss_mlp": 0.0, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07348653509543634, + "language_loss": 0.83014315, + "learning_rate": 0.0002730880299646927, + "loss": 0.84089994, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3433, + "time_per_iteration": 3.09417462348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072804, + "balance_loss_mlp": 1.06348789, + "diversity_loss_mlp": 0.0, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.060523936244010056, + "language_loss": 0.85307741, + "learning_rate": 0.0002728104613660821, + "loss": 0.86380541, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3434, + "time_per_iteration": 2.844012498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.06231332, + "diversity_loss_mlp": 0.0, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06580511923703304, + "language_loss": 0.83062303, + "learning_rate": 0.0002725329809702729, + "loss": 0.84133923, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3435, + "time_per_iteration": 3.203927516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.06119871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.07937285786961487, + "language_loss": 0.76092625, + "learning_rate": 0.0002722555888849921, + "loss": 0.77163053, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3436, + "time_per_iteration": 3.441042423248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.06265306, + "diversity_loss_mlp": 0.0, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06477982340890849, + "language_loss": 0.80420995, + "learning_rate": 0.00027197828521793334, + "loss": 0.81492901, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3437, + "time_per_iteration": 2.508976697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.0631609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.05773126923802199, + "language_loss": 0.85235512, + "learning_rate": 0.0002717010700767552, + "loss": 0.86308175, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3438, + "time_per_iteration": 2.7343809604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788388, + "balance_loss_mlp": 1.33122396, + "diversity_loss_mlp": 0.22170436, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.035967269047030424, + "language_loss": 0.76073134, + "learning_rate": 0.00027142394356908226, + "loss": 0.76861525, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01192367, + "step": 3439, + "time_per_iteration": 2.6098694801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072149, + "balance_loss_mlp": 1.06304741, + "diversity_loss_mlp": 0.0, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.07092995700037574, + "language_loss": 0.84935868, + "learning_rate": 0.00027114690580250456, + "loss": 0.86008012, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3440, + "time_per_iteration": 2.7477781772613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_mlp": 1.05864227, + "diversity_loss_mlp": 0.0, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.07606845250334485, + "language_loss": 0.87084186, + "learning_rate": 0.0002708699568845776, + "loss": 0.88152039, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3441, + "time_per_iteration": 2.6247143745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068327, + "balance_loss_mlp": 1.062343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.03817420207517821, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80356109, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 3442, + "time_per_iteration": 4.9118194580078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070581, + "balance_loss_mlp": 1.06144977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.059711141008881904, + "language_loss": 0.83110899, + "learning_rate": 0.0002703163260247261, + "loss": 0.84181482, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3443, + "time_per_iteration": 2.6146388053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070764, + "balance_loss_mlp": 1.06162047, + "diversity_loss_mlp": 0.0, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.07293118954211444, + "language_loss": 0.81726909, + "learning_rate": 0.0002700396442977399, + "loss": 0.82797676, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3444, + "time_per_iteration": 2.6122488975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072126, + "balance_loss_mlp": 1.06287587, + "diversity_loss_mlp": 0.0, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06235524151571192, + "language_loss": 0.84365332, + "learning_rate": 0.0002697630518492817, + "loss": 0.85437459, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3445, + "time_per_iteration": 2.695577621459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074381, + "balance_loss_mlp": 1.06496358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.09449311389962292, + "language_loss": 0.85555631, + "learning_rate": 0.0002694865487867343, + "loss": 0.86630011, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3446, + "time_per_iteration": 2.643448829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066192, + "balance_loss_mlp": 1.0568881, + "diversity_loss_mlp": 0.0, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.06130478535455018, + "language_loss": 0.84665477, + "learning_rate": 0.0002692101352174453, + "loss": 0.85731673, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3447, + "time_per_iteration": 2.7684693336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071985, + "balance_loss_mlp": 1.06239462, + "diversity_loss_mlp": 0.0, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.0686574359328325, + "language_loss": 0.84783942, + "learning_rate": 0.00026893381124872787, + "loss": 0.85855925, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3448, + "time_per_iteration": 2.6856155395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.06869519, + "diversity_loss_mlp": 0.0, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.07711664740076789, + "language_loss": 0.80761468, + "learning_rate": 0.00026865757698786097, + "loss": 0.8183924, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3449, + "time_per_iteration": 3.0219905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064699, + "balance_loss_mlp": 1.05549026, + "diversity_loss_mlp": 0.0, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.07081100750222453, + "language_loss": 0.81853712, + "learning_rate": 0.000268381432542088, + "loss": 0.82918411, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3450, + "time_per_iteration": 2.7959303855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063443, + "balance_loss_mlp": 1.05394757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.0764006206271421, + "language_loss": 0.80043346, + "learning_rate": 0.00026810537801861807, + "loss": 0.81106788, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3451, + "time_per_iteration": 2.7303504943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058217, + "balance_loss_mlp": 1.04875171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05834244489040309, + "language_loss": 0.81090832, + "learning_rate": 0.0002678294135246243, + "loss": 0.82149041, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3452, + "time_per_iteration": 2.733463764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056165, + "balance_loss_mlp": 1.04691422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07343702884431198, + "language_loss": 0.86356318, + "learning_rate": 0.0002675535391672463, + "loss": 0.87412483, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3453, + "time_per_iteration": 3.115978956222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797636, + "balance_loss_mlp": 1.35083306, + "diversity_loss_mlp": 0.22054271, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.028810841374919304, + "language_loss": 0.86237454, + "learning_rate": 0.0002672777550535877, + "loss": 0.87035096, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01194801, + "step": 3454, + "time_per_iteration": 2.793548822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.05172312, + "diversity_loss_mlp": 0.0, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.0753840272591569, + "language_loss": 0.85331321, + "learning_rate": 0.00026700206129071747, + "loss": 0.8639214, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3455, + "time_per_iteration": 2.5915210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_mlp": 1.05565548, + "diversity_loss_mlp": 0.0, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.07433202645873906, + "language_loss": 0.89061069, + "learning_rate": 0.00026672645798566925, + "loss": 0.90125895, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3456, + "time_per_iteration": 2.5754494667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059485, + "balance_loss_mlp": 1.05019283, + "diversity_loss_mlp": 0.0, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.07294926148794169, + "language_loss": 0.79539233, + "learning_rate": 0.00026645094524544225, + "loss": 0.80598718, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.09289551, + "routerloss_mlp": 0.0, + "step": 3457, + "time_per_iteration": 3.2948148250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056831, + "balance_loss_mlp": 1.04734802, + "diversity_loss_mlp": 0.0, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.08386362480566827, + "language_loss": 0.75221157, + "learning_rate": 0.00026617552317699945, + "loss": 0.76277989, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3458, + "time_per_iteration": 2.789961576461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057714, + "balance_loss_mlp": 1.04836822, + "diversity_loss_mlp": 0.0, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.09354786354914506, + "language_loss": 0.87007248, + "learning_rate": 0.0002659001918872693, + "loss": 0.88064957, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3459, + "time_per_iteration": 2.6320250034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058346, + "balance_loss_mlp": 1.04896998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06598239053228593, + "language_loss": 0.80718446, + "learning_rate": 0.0002656249514831449, + "loss": 0.81776798, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3460, + "time_per_iteration": 2.6485753059387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063647, + "balance_loss_mlp": 1.05442595, + "diversity_loss_mlp": 0.0, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.05863451757746151, + "language_loss": 0.87114978, + "learning_rate": 0.00026534980207148416, + "loss": 0.88178623, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3461, + "time_per_iteration": 3.4618935585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05719471, + "diversity_loss_mlp": 0.0, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.07572861338992695, + "language_loss": 0.73451698, + "learning_rate": 0.0002650747437591097, + "loss": 0.7451815, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3462, + "time_per_iteration": 2.985516309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_mlp": 1.02065372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.017950660829121307, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82906377, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 3463, + "time_per_iteration": 5.041592359542847 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067724, + "balance_loss_mlp": 1.05844963, + "diversity_loss_mlp": 0.0, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.06793562911737132, + "language_loss": 0.86417711, + "learning_rate": 0.00026452490085933155, + "loss": 0.87485433, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3464, + "time_per_iteration": 2.5661425590515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069546, + "balance_loss_mlp": 1.05994368, + "diversity_loss_mlp": 0.0, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.08819800975527838, + "language_loss": 0.89818048, + "learning_rate": 0.00026425011648539614, + "loss": 0.90887594, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3465, + "time_per_iteration": 2.5488314628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065239, + "balance_loss_mlp": 1.05584478, + "diversity_loss_mlp": 0.0, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06406494944770698, + "language_loss": 0.82567346, + "learning_rate": 0.00026397542363768267, + "loss": 0.83632582, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3466, + "time_per_iteration": 2.669250965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781407, + "balance_loss_mlp": 1.32080197, + "diversity_loss_mlp": 0.21862534, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.03313864292511896, + "language_loss": 0.8202821, + "learning_rate": 0.0002637008224228362, + "loss": 0.82809615, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169338, + "step": 3467, + "time_per_iteration": 2.572173833847046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070583, + "balance_loss_mlp": 1.06133246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.05107139851875669, + "language_loss": 0.8441903, + "learning_rate": 0.00026342631294746653, + "loss": 0.85489613, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3468, + "time_per_iteration": 2.698885917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072156, + "balance_loss_mlp": 1.06254137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.05734496396036439, + "language_loss": 0.80842233, + "learning_rate": 0.0002631518953181476, + "loss": 0.81914389, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3469, + "time_per_iteration": 3.4733734130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101407, + "balance_loss_mlp": 1.0077759, + "diversity_loss_mlp": 0.0, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.015747171991140264, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77339357, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 3470, + "time_per_iteration": 4.929265737533569 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074034, + "balance_loss_mlp": 1.06445539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.060826323549746535, + "language_loss": 0.80429429, + "learning_rate": 0.00026260333602377985, + "loss": 0.81503463, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3471, + "time_per_iteration": 2.848822593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076958, + "balance_loss_mlp": 1.06758189, + "diversity_loss_mlp": 0.0, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.07184696149338711, + "language_loss": 0.87395489, + "learning_rate": 0.0002623291945717007, + "loss": 0.88472444, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3472, + "time_per_iteration": 2.500190019607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.06426954, + "diversity_loss_mlp": 0.0, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.06589735356893138, + "language_loss": 0.84111875, + "learning_rate": 0.00026205514539161175, + "loss": 0.85185778, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3473, + "time_per_iteration": 3.534797191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.0632112, + "diversity_loss_mlp": 0.0, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.059882211902428664, + "language_loss": 0.83973366, + "learning_rate": 0.00026178118858990773, + "loss": 0.8504616, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3474, + "time_per_iteration": 2.8565967082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070699, + "balance_loss_mlp": 1.06109083, + "diversity_loss_mlp": 0.0, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.06021787961002869, + "language_loss": 0.84205377, + "learning_rate": 0.0002615073242729483, + "loss": 0.85276067, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3475, + "time_per_iteration": 2.678913116455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_mlp": 1.0605185, + "diversity_loss_mlp": 0.0, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.05349171948445146, + "language_loss": 0.84449661, + "learning_rate": 0.0002612335525470573, + "loss": 0.85519814, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3476, + "time_per_iteration": 2.8754477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05415487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.0743507074362168, + "language_loss": 0.78049976, + "learning_rate": 0.0002609598735185221, + "loss": 0.79113823, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3477, + "time_per_iteration": 2.6721932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_mlp": 1.05687511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.06005632064488323, + "language_loss": 0.83158946, + "learning_rate": 0.00026068628729359445, + "loss": 0.84225374, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3478, + "time_per_iteration": 2.7650654315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068151, + "balance_loss_mlp": 1.05830431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.0704650229723735, + "language_loss": 0.76221395, + "learning_rate": 0.00026041279397848996, + "loss": 0.77289546, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3479, + "time_per_iteration": 2.8531105518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.055673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.06824163679163787, + "language_loss": 0.82570118, + "learning_rate": 0.00026013939367938797, + "loss": 0.8363536, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3480, + "time_per_iteration": 2.8762619495391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798551, + "balance_loss_mlp": 1.35232079, + "diversity_loss_mlp": 0.22152299, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.028482542431452974, + "language_loss": 0.81186199, + "learning_rate": 0.00025986608650243204, + "loss": 0.81984746, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01162949, + "step": 3481, + "time_per_iteration": 2.8153860569000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071437, + "balance_loss_mlp": 1.06166184, + "diversity_loss_mlp": 0.0, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.08903053329626802, + "language_loss": 0.79281807, + "learning_rate": 0.0002595928725537293, + "loss": 0.80353248, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3482, + "time_per_iteration": 2.8563952445983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_mlp": 1.05542827, + "diversity_loss_mlp": 0.0, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.06597366352184171, + "language_loss": 0.8811605, + "learning_rate": 0.0002593197519393509, + "loss": 0.89181018, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3483, + "time_per_iteration": 2.659468650817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_mlp": 1.05117035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06129183928704833, + "language_loss": 0.79517573, + "learning_rate": 0.00025904672476533165, + "loss": 0.80578125, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3484, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_mlp": 1.0531497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.06231151375576235, + "language_loss": 0.82821012, + "learning_rate": 0.0002587737911376704, + "loss": 0.83883744, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3485, + "time_per_iteration": 2.579852819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.0560143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06196157664485949, + "language_loss": 0.84223086, + "learning_rate": 0.00025850095116232885, + "loss": 0.85288531, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3486, + "time_per_iteration": 2.6867549419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059913, + "balance_loss_mlp": 1.05029857, + "diversity_loss_mlp": 0.0, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.07455755751361211, + "language_loss": 0.77796304, + "learning_rate": 0.000258228204945233, + "loss": 0.78856218, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3487, + "time_per_iteration": 2.9104583263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788296, + "balance_loss_mlp": 1.33072948, + "diversity_loss_mlp": 0.22110668, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.03107378418050736, + "language_loss": 0.84813625, + "learning_rate": 0.00025795555259227254, + "loss": 0.8560192, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123779, + "step": 3488, + "time_per_iteration": 2.799049139022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_mlp": 1.05453348, + "diversity_loss_mlp": 0.0, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.05587900492957358, + "language_loss": 0.8365714, + "learning_rate": 0.00025768299420930046, + "loss": 0.84721196, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3489, + "time_per_iteration": 2.7350802421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059988, + "balance_loss_mlp": 1.05058801, + "diversity_loss_mlp": 0.0, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.0636982622522837, + "language_loss": 0.83686626, + "learning_rate": 0.0002574105299021332, + "loss": 0.84746611, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3490, + "time_per_iteration": 2.8952267169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056705, + "balance_loss_mlp": 1.04722226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.059047086854658884, + "language_loss": 0.84235394, + "learning_rate": 0.00025713815977655084, + "loss": 0.85292095, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3491, + "time_per_iteration": 2.8801188468933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059823, + "balance_loss_mlp": 1.05020285, + "diversity_loss_mlp": 0.0, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0713613195550899, + "language_loss": 0.84868813, + "learning_rate": 0.0002568658839382969, + "loss": 0.85928631, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3492, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055936, + "balance_loss_mlp": 1.04666197, + "diversity_loss_mlp": 0.0, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.0809894292628365, + "language_loss": 0.8436929, + "learning_rate": 0.00025659370249307814, + "loss": 0.85425228, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3493, + "time_per_iteration": 2.61505126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_mlp": 1.04709792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.06605957100839344, + "language_loss": 0.85386133, + "learning_rate": 0.00025632161554656473, + "loss": 0.86442864, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3494, + "time_per_iteration": 2.8639488220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_mlp": 1.04485643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.0758709557174038, + "language_loss": 0.8232398, + "learning_rate": 0.00025604962320439017, + "loss": 0.83378488, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3495, + "time_per_iteration": 2.71235728263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_mlp": 1.04692411, + "diversity_loss_mlp": 0.0, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.06832671008161519, + "language_loss": 0.82082075, + "learning_rate": 0.0002557777255721516, + "loss": 0.83138162, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3496, + "time_per_iteration": 2.728652000427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052712, + "balance_loss_mlp": 1.04334807, + "diversity_loss_mlp": 0.0, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.07590882568517338, + "language_loss": 0.80502313, + "learning_rate": 0.0002555059227554087, + "loss": 0.81555027, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3497, + "time_per_iteration": 2.6704843044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.04488301, + "diversity_loss_mlp": 0.0, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.0738650094824256, + "language_loss": 0.77972269, + "learning_rate": 0.00025523421485968453, + "loss": 0.79026586, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3498, + "time_per_iteration": 2.8093771934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057909, + "balance_loss_mlp": 1.04843736, + "diversity_loss_mlp": 0.0, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.07086262263525961, + "language_loss": 0.85447127, + "learning_rate": 0.00025496260199046585, + "loss": 0.86505038, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3499, + "time_per_iteration": 3.0010836124420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.04721487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.056698795982303, + "language_loss": 0.84606051, + "learning_rate": 0.000254691084253202, + "loss": 0.85662723, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3500, + "time_per_iteration": 2.7931160926818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106411, + "balance_loss_mlp": 1.05446577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.075539637024569, + "language_loss": 0.77243733, + "learning_rate": 0.00025441966175330567, + "loss": 0.78307843, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3501, + "time_per_iteration": 2.6508493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067964, + "balance_loss_mlp": 1.05850506, + "diversity_loss_mlp": 0.0, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.07065885937587965, + "language_loss": 0.79737401, + "learning_rate": 0.00025414833459615183, + "loss": 0.80805361, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3502, + "time_per_iteration": 2.784524917602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074197, + "balance_loss_mlp": 1.0648514, + "diversity_loss_mlp": 0.0, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.06652503704287359, + "language_loss": 0.80206275, + "learning_rate": 0.0002538771028870796, + "loss": 0.8128047, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3503, + "time_per_iteration": 2.802136182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075432, + "balance_loss_mlp": 1.06571674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.06376799007020843, + "language_loss": 0.81455564, + "learning_rate": 0.0002536059667313903, + "loss": 0.82530999, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3504, + "time_per_iteration": 2.711933135986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068251, + "balance_loss_mlp": 1.05844057, + "diversity_loss_mlp": 0.0, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.09964706429340704, + "language_loss": 0.89608288, + "learning_rate": 0.0002533349262343483, + "loss": 0.9067654, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3505, + "time_per_iteration": 2.6715004444122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082337, + "balance_loss_mlp": 1.07268143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.06572677444304757, + "language_loss": 0.81604284, + "learning_rate": 0.0002530639815011807, + "loss": 0.82686627, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3506, + "time_per_iteration": 2.4929287433624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078955, + "balance_loss_mlp": 1.33325195, + "diversity_loss_mlp": 0.2229899, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.03439328096706921, + "language_loss": 0.8506915, + "learning_rate": 0.0002527931326370781, + "loss": 0.85858697, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01142928, + "step": 3507, + "time_per_iteration": 2.83644962310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07446539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.08750505461607005, + "language_loss": 0.82915336, + "learning_rate": 0.00025252237974719276, + "loss": 0.83999527, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3508, + "time_per_iteration": 2.871253252029419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081215, + "balance_loss_mlp": 1.07155883, + "diversity_loss_mlp": 0.0, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.08335060522291943, + "language_loss": 0.80458963, + "learning_rate": 0.00025225172293664056, + "loss": 0.81540173, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3509, + "time_per_iteration": 3.033853530883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014527, + "balance_loss_mlp": 1.00832772, + "diversity_loss_mlp": 0.0, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.01800991302482, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77947664, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 3510, + "time_per_iteration": 4.911616325378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.07521439, + "diversity_loss_mlp": 0.0, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.09401749664970258, + "language_loss": 0.84862983, + "learning_rate": 0.00025171069797381106, + "loss": 0.85948253, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3511, + "time_per_iteration": 2.8283350467681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071317, + "balance_loss_mlp": 1.06139874, + "diversity_loss_mlp": 0.0, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.06520954806538445, + "language_loss": 0.82273233, + "learning_rate": 0.00025144033003157864, + "loss": 0.83344549, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3512, + "time_per_iteration": 2.5983166694641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.06117821, + "diversity_loss_mlp": 0.0, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.08310754245868612, + "language_loss": 0.78935671, + "learning_rate": 0.00025117005858876806, + "loss": 0.80006635, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3513, + "time_per_iteration": 2.6797635555267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787238, + "balance_loss_mlp": 1.33182001, + "diversity_loss_mlp": 0.21994653, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.03353723121835004, + "language_loss": 0.85560071, + "learning_rate": 0.000250899883750308, + "loss": 0.86347306, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113544, + "step": 3514, + "time_per_iteration": 2.7176060676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_mlp": 1.04921913, + "diversity_loss_mlp": 0.0, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.07453608092591449, + "language_loss": 0.81898236, + "learning_rate": 0.00025062980562109006, + "loss": 0.82957679, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3515, + "time_per_iteration": 2.7594966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789958, + "balance_loss_mlp": 1.33716106, + "diversity_loss_mlp": 0.21975538, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.033729691487123833, + "language_loss": 0.83036506, + "learning_rate": 0.0002503598243059677, + "loss": 0.83826458, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01150025, + "step": 3516, + "time_per_iteration": 2.891763687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_mlp": 1.04839277, + "diversity_loss_mlp": 0.0, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.07017833187059877, + "language_loss": 0.80408925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81467211, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3517, + "time_per_iteration": 2.672029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786476, + "balance_loss_mlp": 1.32907259, + "diversity_loss_mlp": 0.22110882, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.038425556988831724, + "language_loss": 0.85818875, + "learning_rate": 0.0002498201525372359, + "loss": 0.86605346, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138566, + "step": 3518, + "time_per_iteration": 2.617760419845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054389, + "balance_loss_mlp": 1.04459572, + "diversity_loss_mlp": 0.0, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.06814874892769256, + "language_loss": 0.83201683, + "learning_rate": 0.00024955046229314584, + "loss": 0.84256077, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.09783936, + "routerloss_mlp": 0.0, + "step": 3519, + "time_per_iteration": 2.6269547939300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.04138207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06326657634867637, + "language_loss": 0.87517166, + "learning_rate": 0.00024928086928218947, + "loss": 0.88568723, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 3520, + "time_per_iteration": 2.500542163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057369, + "balance_loss_mlp": 1.04749823, + "diversity_loss_mlp": 0.0, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.0729210521666428, + "language_loss": 0.76251125, + "learning_rate": 0.00024901137360903216, + "loss": 0.77308488, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3521, + "time_per_iteration": 2.921558380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055481, + "balance_loss_mlp": 1.04586673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.08065371435227142, + "language_loss": 0.80853164, + "learning_rate": 0.00024874197537830115, + "loss": 0.81908649, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3522, + "time_per_iteration": 2.5280978679656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793286, + "balance_loss_mlp": 1.3416667, + "diversity_loss_mlp": 0.22178407, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.034341347950706966, + "language_loss": 0.834656, + "learning_rate": 0.00024847267469458684, + "loss": 0.8425889, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0115611, + "step": 3523, + "time_per_iteration": 2.5251760482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058563, + "balance_loss_mlp": 1.04881763, + "diversity_loss_mlp": 0.0, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.0593554156839795, + "language_loss": 0.77790511, + "learning_rate": 0.00024820347166244034, + "loss": 0.78849077, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3524, + "time_per_iteration": 2.9970362186431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061163, + "balance_loss_mlp": 1.051489, + "diversity_loss_mlp": 0.0, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05785383684082485, + "language_loss": 0.8476572, + "learning_rate": 0.0002479343663863755, + "loss": 0.85826874, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3525, + "time_per_iteration": 2.748159885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059476, + "balance_loss_mlp": 1.04968917, + "diversity_loss_mlp": 0.0, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.0719627260838572, + "language_loss": 0.76970756, + "learning_rate": 0.00024766535897086876, + "loss": 0.78030241, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3526, + "time_per_iteration": 2.5848824977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060587, + "balance_loss_mlp": 1.05073428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.06835251841322831, + "language_loss": 0.79290187, + "learning_rate": 0.0002473964495203578, + "loss": 0.80350775, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3527, + "time_per_iteration": 2.6953914165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106164, + "balance_loss_mlp": 1.05191827, + "diversity_loss_mlp": 0.0, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.06684083470405644, + "language_loss": 0.85681713, + "learning_rate": 0.0002471276381392425, + "loss": 0.86743355, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3528, + "time_per_iteration": 2.7917094230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_mlp": 1.02451074, + "diversity_loss_mlp": 0.0, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.029269024795112553, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.7921958, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 3529, + "time_per_iteration": 4.962055921554565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066825, + "balance_loss_mlp": 1.05733609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06831388456608918, + "language_loss": 0.84243917, + "learning_rate": 0.00024659031000260826, + "loss": 0.85310745, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3530, + "time_per_iteration": 2.8746378421783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_mlp": 1.05688381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.07285232550578888, + "language_loss": 0.80730051, + "learning_rate": 0.0002463217934556985, + "loss": 0.81796598, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3531, + "time_per_iteration": 2.7028424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014286, + "balance_loss_mlp": 1.00808728, + "diversity_loss_mlp": 0.0, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.01858574921496822, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77546376, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.06201172, + "routerloss_mlp": 0.0, + "step": 3532, + "time_per_iteration": 4.780252933502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071665, + "balance_loss_mlp": 1.06221724, + "diversity_loss_mlp": 0.0, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.08979673870599186, + "language_loss": 0.83808529, + "learning_rate": 0.0002457850559259306, + "loss": 0.84880191, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3533, + "time_per_iteration": 2.9009928703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107616, + "balance_loss_mlp": 1.06684947, + "diversity_loss_mlp": 0.0, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.06667977411786664, + "language_loss": 0.81866515, + "learning_rate": 0.00024551683515145275, + "loss": 0.82942677, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3534, + "time_per_iteration": 2.67411208152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.0675205, + "diversity_loss_mlp": 0.0, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.06662082176408471, + "language_loss": 0.86499625, + "learning_rate": 0.0002452487131761014, + "loss": 0.87576586, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3535, + "time_per_iteration": 2.723414421081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071026, + "balance_loss_mlp": 1.06126261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.07513209939898634, + "language_loss": 0.79904449, + "learning_rate": 0.00024498069010397093, + "loss": 0.80975473, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3536, + "time_per_iteration": 2.729044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.06177378, + "diversity_loss_mlp": 0.0, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.062001089349607685, + "language_loss": 0.85142958, + "learning_rate": 0.00024471276603911697, + "loss": 0.86214507, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3537, + "time_per_iteration": 4.243680953979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.06360102, + "diversity_loss_mlp": 0.0, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.06230124795461592, + "language_loss": 0.79373354, + "learning_rate": 0.0002444449410855572, + "loss": 0.80446529, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3538, + "time_per_iteration": 2.744311571121216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071763, + "balance_loss_mlp": 1.06218505, + "diversity_loss_mlp": 0.0, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.057428584707934646, + "language_loss": 0.84307408, + "learning_rate": 0.00024417721534727033, + "loss": 0.85379171, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3539, + "time_per_iteration": 2.643796920776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073723, + "balance_loss_mlp": 1.06420994, + "diversity_loss_mlp": 0.0, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.09448746877359589, + "language_loss": 0.82968056, + "learning_rate": 0.00024390958892819687, + "loss": 0.8404178, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3540, + "time_per_iteration": 2.500807285308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010722, + "balance_loss_mlp": 1.0624193, + "diversity_loss_mlp": 0.0, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.06494427347835982, + "language_loss": 0.80941665, + "learning_rate": 0.0002436420619322381, + "loss": 0.82013869, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3541, + "time_per_iteration": 2.8345742225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.0675267, + "diversity_loss_mlp": 0.0, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.07816741001086884, + "language_loss": 0.82754946, + "learning_rate": 0.0002433746344632577, + "loss": 0.83832312, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3542, + "time_per_iteration": 2.6863982677459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.05741465, + "diversity_loss_mlp": 0.0, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.06517118266272649, + "language_loss": 0.80166835, + "learning_rate": 0.00024310730662508006, + "loss": 0.81234175, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3543, + "time_per_iteration": 3.0644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070309, + "balance_loss_mlp": 1.06105816, + "diversity_loss_mlp": 0.0, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.06994305910782121, + "language_loss": 0.87753445, + "learning_rate": 0.0002428400785214911, + "loss": 0.88823748, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3544, + "time_per_iteration": 2.5769219398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.06136894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.07082765333867001, + "language_loss": 0.82354796, + "learning_rate": 0.00024257295025623794, + "loss": 0.83425504, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3545, + "time_per_iteration": 2.799276828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066867, + "balance_loss_mlp": 1.05750871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.06649234916050309, + "language_loss": 0.8049404, + "learning_rate": 0.00024230592193302892, + "loss": 0.8156091, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3546, + "time_per_iteration": 2.9205825328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064713, + "balance_loss_mlp": 1.05521762, + "diversity_loss_mlp": 0.0, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.07288649013986744, + "language_loss": 0.84268177, + "learning_rate": 0.00024203899365553372, + "loss": 0.85332888, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3547, + "time_per_iteration": 2.5345499515533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_mlp": 1.02241051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024887330229706912, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77762419, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3548, + "time_per_iteration": 4.575555801391602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066126, + "balance_loss_mlp": 1.05700111, + "diversity_loss_mlp": 0.0, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06418703018565212, + "language_loss": 0.83182037, + "learning_rate": 0.00024150543765216848, + "loss": 0.84248167, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3549, + "time_per_iteration": 2.9021003246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060662, + "balance_loss_mlp": 1.05113733, + "diversity_loss_mlp": 0.0, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.07049185581954354, + "language_loss": 0.83715057, + "learning_rate": 0.00024123881013344352, + "loss": 0.8477571, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3550, + "time_per_iteration": 2.671104669570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.05335271, + "diversity_loss_mlp": 0.0, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.06503037380674516, + "language_loss": 0.7999897, + "learning_rate": 0.00024097228307472202, + "loss": 0.81061488, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3551, + "time_per_iteration": 2.826650619506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05474889, + "diversity_loss_mlp": 0.0, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06680109192015529, + "language_loss": 0.82289582, + "learning_rate": 0.00024070585657947846, + "loss": 0.83353829, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3552, + "time_per_iteration": 2.831995725631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.05527949, + "diversity_loss_mlp": 0.0, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.065434895685697, + "language_loss": 0.85023475, + "learning_rate": 0.00024043953075114934, + "loss": 0.86087978, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3553, + "time_per_iteration": 2.622846841812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055947, + "balance_loss_mlp": 1.0463928, + "diversity_loss_mlp": 0.0, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.07243414619593286, + "language_loss": 0.89257199, + "learning_rate": 0.00024017330569313128, + "loss": 0.90313148, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3554, + "time_per_iteration": 2.705098867416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065851, + "balance_loss_mlp": 1.05631375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.06810293796091849, + "language_loss": 0.7482394, + "learning_rate": 0.0002399071815087821, + "loss": 0.7588979, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3555, + "time_per_iteration": 3.053788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.05496788, + "diversity_loss_mlp": 0.0, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0721005752972134, + "language_loss": 0.83788198, + "learning_rate": 0.00023964115830142025, + "loss": 0.84852719, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3556, + "time_per_iteration": 2.7068707942962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062319, + "balance_loss_mlp": 1.05320573, + "diversity_loss_mlp": 0.0, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.07897700130685587, + "language_loss": 0.87426114, + "learning_rate": 0.00023937523617432522, + "loss": 0.88488424, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3557, + "time_per_iteration": 2.526129722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.05461264, + "diversity_loss_mlp": 0.0, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08002974259616906, + "language_loss": 0.8704505, + "learning_rate": 0.00023910941523073705, + "loss": 0.88108861, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3558, + "time_per_iteration": 3.884982109069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067156, + "balance_loss_mlp": 1.05752969, + "diversity_loss_mlp": 0.0, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.0697798269972245, + "language_loss": 0.86687434, + "learning_rate": 0.0002388436955738566, + "loss": 0.87754589, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3559, + "time_per_iteration": 2.6896438598632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.05763495, + "diversity_loss_mlp": 0.0, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.07371598831130721, + "language_loss": 0.81583881, + "learning_rate": 0.00023857807730684523, + "loss": 0.82651019, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3560, + "time_per_iteration": 2.906409740447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.06119633, + "diversity_loss_mlp": 0.0, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.09020757950976771, + "language_loss": 0.82591355, + "learning_rate": 0.00023831256053282547, + "loss": 0.83662075, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3561, + "time_per_iteration": 2.741647481918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076955, + "balance_loss_mlp": 1.06726301, + "diversity_loss_mlp": 0.0, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06598100836979733, + "language_loss": 0.7798056, + "learning_rate": 0.00023804714535488003, + "loss": 0.79057515, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3562, + "time_per_iteration": 2.8663859367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022665, + "balance_loss_mlp": 1.01694274, + "diversity_loss_mlp": 0.0, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.018293527884891043, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80832297, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.05712891, + "routerloss_mlp": 0.0, + "step": 3563, + "time_per_iteration": 4.938952684402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.06765318, + "diversity_loss_mlp": 0.0, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.06579070354920068, + "language_loss": 0.8089236, + "learning_rate": 0.00023751662019934488, + "loss": 0.81969196, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3564, + "time_per_iteration": 2.4886345863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085968, + "balance_loss_mlp": 1.07677126, + "diversity_loss_mlp": 0.0, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.06770513871895241, + "language_loss": 0.79428673, + "learning_rate": 0.00023725151042772364, + "loss": 0.80514634, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3565, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091397, + "balance_loss_mlp": 1.08220637, + "diversity_loss_mlp": 0.0, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.0657025292696896, + "language_loss": 0.83245081, + "learning_rate": 0.00023698650266411276, + "loss": 0.84336478, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3566, + "time_per_iteration": 2.619652032852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087671, + "balance_loss_mlp": 1.07844996, + "diversity_loss_mlp": 0.0, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.07570090303701395, + "language_loss": 0.82732457, + "learning_rate": 0.00023672159701139755, + "loss": 0.83820128, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3567, + "time_per_iteration": 3.2096190452575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092795, + "balance_loss_mlp": 1.08350825, + "diversity_loss_mlp": 0.0, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.07219945861824417, + "language_loss": 0.86111134, + "learning_rate": 0.00023645679357242296, + "loss": 0.87203926, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3568, + "time_per_iteration": 2.598115921020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792363, + "balance_loss_mlp": 1.34135008, + "diversity_loss_mlp": 0.22022857, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.03374979092207147, + "language_loss": 0.84308195, + "learning_rate": 0.00023619209244999534, + "loss": 0.85100567, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01157361, + "step": 3569, + "time_per_iteration": 2.647141695022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.07559109, + "diversity_loss_mlp": 0.0, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.09720254317506574, + "language_loss": 0.85017771, + "learning_rate": 0.0002359274937468806, + "loss": 0.86102515, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3570, + "time_per_iteration": 2.5088424682617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080851, + "balance_loss_mlp": 1.07149255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.06491952507138833, + "language_loss": 0.77798098, + "learning_rate": 0.00023566299756580512, + "loss": 0.78878951, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3571, + "time_per_iteration": 2.6349782943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.07132113, + "diversity_loss_mlp": 0.0, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.07205344290521438, + "language_loss": 0.78495932, + "learning_rate": 0.0002353986040094551, + "loss": 0.79576588, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3572, + "time_per_iteration": 2.4710493087768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079091, + "balance_loss_mlp": 1.06974494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.07195013135933294, + "language_loss": 0.7977035, + "learning_rate": 0.00023513431318047796, + "loss": 0.80849445, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3573, + "time_per_iteration": 2.5213143825531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081479, + "balance_loss_mlp": 1.07233512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.0671999790126143, + "language_loss": 0.77178657, + "learning_rate": 0.00023487012518147977, + "loss": 0.78260136, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3574, + "time_per_iteration": 3.2319135665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073879, + "balance_loss_mlp": 1.06456256, + "diversity_loss_mlp": 0.0, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06898424741609648, + "language_loss": 0.84452772, + "learning_rate": 0.00023460604011502772, + "loss": 0.85526657, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3575, + "time_per_iteration": 3.8878557682037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075527, + "balance_loss_mlp": 1.0666877, + "diversity_loss_mlp": 0.0, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.0699577179930161, + "language_loss": 0.85862118, + "learning_rate": 0.00023434205808364845, + "loss": 0.86937642, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3576, + "time_per_iteration": 3.1633143424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072972, + "balance_loss_mlp": 1.06390619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.07476899851847786, + "language_loss": 0.85238355, + "learning_rate": 0.00023407817918982932, + "loss": 0.86311328, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3577, + "time_per_iteration": 2.7126357555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075894, + "balance_loss_mlp": 1.06677413, + "diversity_loss_mlp": 0.0, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.07427735671199864, + "language_loss": 0.78816962, + "learning_rate": 0.00023381440353601718, + "loss": 0.79892862, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3578, + "time_per_iteration": 2.9925150871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069003, + "balance_loss_mlp": 1.05976987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.07604251893794473, + "language_loss": 0.86125422, + "learning_rate": 0.00023355073122461822, + "loss": 0.87194419, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3579, + "time_per_iteration": 2.938112258911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.05620754, + "diversity_loss_mlp": 0.0, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06357801718819331, + "language_loss": 0.82597542, + "learning_rate": 0.00023328716235799973, + "loss": 0.83662832, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3580, + "time_per_iteration": 3.2711336612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066362, + "balance_loss_mlp": 1.05755877, + "diversity_loss_mlp": 0.0, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.07922172227575792, + "language_loss": 0.84162283, + "learning_rate": 0.00023302369703848803, + "loss": 0.85228646, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3581, + "time_per_iteration": 2.8185226917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069197, + "balance_loss_mlp": 1.06004775, + "diversity_loss_mlp": 0.0, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.07416922878209098, + "language_loss": 0.79931486, + "learning_rate": 0.00023276033536836937, + "loss": 0.81000686, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3582, + "time_per_iteration": 2.844299554824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061227, + "balance_loss_mlp": 1.05179787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06489183727188522, + "language_loss": 0.85119617, + "learning_rate": 0.00023249707744988984, + "loss": 0.86180842, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3583, + "time_per_iteration": 2.701711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_mlp": 1.05140829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07019303893436639, + "language_loss": 0.82148254, + "learning_rate": 0.00023223392338525529, + "loss": 0.83209163, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3584, + "time_per_iteration": 2.5167200565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053502, + "balance_loss_mlp": 1.04406083, + "diversity_loss_mlp": 0.0, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.06639305906088179, + "language_loss": 0.78639823, + "learning_rate": 0.00023197087327663107, + "loss": 0.79693329, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3585, + "time_per_iteration": 2.6349897384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057819, + "balance_loss_mlp": 1.04834747, + "diversity_loss_mlp": 0.0, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.0732534701091779, + "language_loss": 0.81201088, + "learning_rate": 0.00023170792722614243, + "loss": 0.82258916, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3586, + "time_per_iteration": 2.9198050498962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056115, + "balance_loss_mlp": 1.04651892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.06720533838288198, + "language_loss": 0.83776879, + "learning_rate": 0.00023144508533587377, + "loss": 0.84832996, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3587, + "time_per_iteration": 2.8723502159118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054327, + "balance_loss_mlp": 1.04436147, + "diversity_loss_mlp": 0.0, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.07065225941485688, + "language_loss": 0.78699905, + "learning_rate": 0.0002311823477078698, + "loss": 0.79754233, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3588, + "time_per_iteration": 2.9407894611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054694, + "balance_loss_mlp": 1.04507959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.0778571388662146, + "language_loss": 0.85240763, + "learning_rate": 0.00023091971444413428, + "loss": 0.8629545, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3589, + "time_per_iteration": 2.796943187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054005, + "balance_loss_mlp": 1.04385448, + "diversity_loss_mlp": 0.0, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.0732795678952718, + "language_loss": 0.82600373, + "learning_rate": 0.00023065718564663012, + "loss": 0.8365438, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3590, + "time_per_iteration": 2.742586135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010537, + "balance_loss_mlp": 1.00519681, + "diversity_loss_mlp": 0.0, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.012465594930310886, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74922127, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.0534668, + "routerloss_mlp": 0.0, + "step": 3591, + "time_per_iteration": 4.981812477111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079259, + "balance_loss_mlp": 1.34177041, + "diversity_loss_mlp": 0.2198928, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.028847197535296083, + "language_loss": 0.80689478, + "learning_rate": 0.0002301324418579666, + "loss": 0.81482071, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0117582, + "step": 3592, + "time_per_iteration": 2.71809983253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0058906, + "balance_loss_mlp": 1.01557088, + "diversity_loss_mlp": 0.14263315, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.0010924650790030575, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79277533, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00995804, + "step": 3593, + "time_per_iteration": 4.800194263458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064196, + "balance_loss_mlp": 1.05474234, + "diversity_loss_mlp": 0.0, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.08227146788009188, + "language_loss": 0.80700612, + "learning_rate": 0.00022960811715677415, + "loss": 0.81764805, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3594, + "time_per_iteration": 2.8780887126922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065404, + "balance_loss_mlp": 1.05574787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.06283622806249096, + "language_loss": 0.82029772, + "learning_rate": 0.00022934611221845608, + "loss": 0.83095175, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3595, + "time_per_iteration": 2.80785870552063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062245, + "balance_loss_mlp": 1.05264866, + "diversity_loss_mlp": 0.0, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.07415067488634865, + "language_loss": 0.77666163, + "learning_rate": 0.00022908421235729609, + "loss": 0.78728402, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3596, + "time_per_iteration": 2.75410795211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065698, + "balance_loss_mlp": 1.05607235, + "diversity_loss_mlp": 0.0, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.06984612144500793, + "language_loss": 0.8509379, + "learning_rate": 0.0002288224176749728, + "loss": 0.86159492, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3597, + "time_per_iteration": 2.670696258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070664, + "balance_loss_mlp": 1.06105542, + "diversity_loss_mlp": 0.0, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.1037313094960325, + "language_loss": 0.78704476, + "learning_rate": 0.00022856072827312385, + "loss": 0.79775131, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3598, + "time_per_iteration": 2.795475959777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106581, + "balance_loss_mlp": 1.05624998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06439958207329444, + "language_loss": 0.77316082, + "learning_rate": 0.00022829914425334598, + "loss": 0.78381896, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3599, + "time_per_iteration": 2.6179866790771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064235, + "balance_loss_mlp": 1.05483484, + "diversity_loss_mlp": 0.0, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.06408780313496462, + "language_loss": 0.80725557, + "learning_rate": 0.0002280376657171956, + "loss": 0.81789792, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3600, + "time_per_iteration": 2.633162021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_mlp": 1.05445051, + "diversity_loss_mlp": 0.0, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.07377083778937557, + "language_loss": 0.76414573, + "learning_rate": 0.00022777629276618706, + "loss": 0.77478784, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3601, + "time_per_iteration": 3.0916104316711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065762, + "balance_loss_mlp": 1.05597496, + "diversity_loss_mlp": 0.0, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.06702562864271609, + "language_loss": 0.77948666, + "learning_rate": 0.0002275150255017947, + "loss": 0.79014426, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3602, + "time_per_iteration": 2.7668936252593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012943, + "balance_loss_mlp": 1.00765014, + "diversity_loss_mlp": 0.0, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.010670435186768691, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76745617, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 3603, + "time_per_iteration": 5.010159492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011501, + "balance_loss_mlp": 1.00618434, + "diversity_loss_mlp": 0.0, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.00963913060826947, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76138604, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3604, + "time_per_iteration": 4.7926812171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061095, + "balance_loss_mlp": 1.05157018, + "diversity_loss_mlp": 0.0, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06111799581134822, + "language_loss": 0.84283471, + "learning_rate": 0.0002267318588424379, + "loss": 0.85344565, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3605, + "time_per_iteration": 2.732388496398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_mlp": 1.04717207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.07244313312376265, + "language_loss": 0.87551069, + "learning_rate": 0.00022647101533842845, + "loss": 0.88607633, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3606, + "time_per_iteration": 3.001912832260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_mlp": 1.04882836, + "diversity_loss_mlp": 0.0, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.07498146805012186, + "language_loss": 0.76334918, + "learning_rate": 0.00022621027802778872, + "loss": 0.77393162, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3607, + "time_per_iteration": 2.6257400512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052774, + "balance_loss_mlp": 1.04345798, + "diversity_loss_mlp": 0.0, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.07029819881410336, + "language_loss": 0.78756207, + "learning_rate": 0.00022594964701174586, + "loss": 0.79808986, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3608, + "time_per_iteration": 2.6099236011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065561, + "balance_loss_mlp": 1.05642402, + "diversity_loss_mlp": 0.0, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.10152593614861574, + "language_loss": 0.84643018, + "learning_rate": 0.00022568912239148586, + "loss": 0.85708582, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3609, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059207, + "balance_loss_mlp": 1.04986095, + "diversity_loss_mlp": 0.0, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.06906376751770449, + "language_loss": 0.81638551, + "learning_rate": 0.00022542870426815344, + "loss": 0.82697761, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3610, + "time_per_iteration": 2.69460129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_mlp": 1.04869449, + "diversity_loss_mlp": 0.0, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.07528135941421366, + "language_loss": 0.86051476, + "learning_rate": 0.00022516839274285173, + "loss": 0.87109709, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3611, + "time_per_iteration": 2.5634658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063678, + "balance_loss_mlp": 1.05389714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.06331906344074151, + "language_loss": 0.7521888, + "learning_rate": 0.00022490818791664265, + "loss": 0.76282561, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3612, + "time_per_iteration": 2.617492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.05837226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.05946591075452152, + "language_loss": 0.85666263, + "learning_rate": 0.00022464808989054676, + "loss": 0.86734116, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3613, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789837, + "balance_loss_mlp": 1.33770788, + "diversity_loss_mlp": 0.21965824, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.03604068217542595, + "language_loss": 0.76138353, + "learning_rate": 0.00022438809876554284, + "loss": 0.76928186, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115366, + "step": 3614, + "time_per_iteration": 2.6613171100616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070097, + "balance_loss_mlp": 1.0602442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.08971125257054285, + "language_loss": 0.80425173, + "learning_rate": 0.00022412821464256873, + "loss": 0.81495273, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3615, + "time_per_iteration": 2.7288718223571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_mlp": 1.06157804, + "diversity_loss_mlp": 0.0, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.07384702921709109, + "language_loss": 0.82342923, + "learning_rate": 0.00022386843762252023, + "loss": 0.83414114, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3616, + "time_per_iteration": 2.5761711597442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106737, + "balance_loss_mlp": 1.0575707, + "diversity_loss_mlp": 0.0, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.07908443617567998, + "language_loss": 0.79798818, + "learning_rate": 0.00022360876780625193, + "loss": 0.80866194, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3617, + "time_per_iteration": 2.6008386611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059868, + "balance_loss_mlp": 1.05015886, + "diversity_loss_mlp": 0.0, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.07021226627677062, + "language_loss": 0.80116498, + "learning_rate": 0.00022334920529457604, + "loss": 0.81176364, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3618, + "time_per_iteration": 2.9185733795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105864, + "balance_loss_mlp": 1.04876924, + "diversity_loss_mlp": 0.0, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05697997760775425, + "language_loss": 0.87189567, + "learning_rate": 0.00022308975018826423, + "loss": 0.88248205, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3619, + "time_per_iteration": 2.927544355392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_mlp": 1.04414856, + "diversity_loss_mlp": 0.0, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.0740354998090604, + "language_loss": 0.84932256, + "learning_rate": 0.00022283040258804564, + "loss": 0.85986531, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3620, + "time_per_iteration": 2.755613327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787662, + "balance_loss_mlp": 1.33203387, + "diversity_loss_mlp": 0.22018704, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.033538632644234186, + "language_loss": 0.83875167, + "learning_rate": 0.00022257116259460802, + "loss": 0.84662825, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155162, + "step": 3621, + "time_per_iteration": 2.844062089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.03843641, + "diversity_loss_mlp": 0.0, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.06349986715080715, + "language_loss": 0.81602001, + "learning_rate": 0.00022231203030859725, + "loss": 0.82649869, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3622, + "time_per_iteration": 2.9582505226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04382682, + "diversity_loss_mlp": 0.0, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.09473470519326596, + "language_loss": 0.83760095, + "learning_rate": 0.00022205300583061737, + "loss": 0.84813607, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3623, + "time_per_iteration": 2.5727412700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016252, + "balance_loss_mlp": 1.01057744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.01746847385777515, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83854461, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.05664062, + "routerloss_mlp": 0.0, + "step": 3624, + "time_per_iteration": 4.8940582275390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051503, + "balance_loss_mlp": 1.04190028, + "diversity_loss_mlp": 0.0, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.07214179790538137, + "language_loss": 0.77598304, + "learning_rate": 0.00022153528070095735, + "loss": 0.78649807, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3625, + "time_per_iteration": 2.694251298904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_mlp": 1.03960037, + "diversity_loss_mlp": 0.0, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07542787145084529, + "language_loss": 0.88381326, + "learning_rate": 0.00022127658025027568, + "loss": 0.89430594, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3626, + "time_per_iteration": 2.6595661640167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053933, + "balance_loss_mlp": 1.04412818, + "diversity_loss_mlp": 0.0, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.08038583191357998, + "language_loss": 0.85689813, + "learning_rate": 0.00022101798800962258, + "loss": 0.86743748, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3627, + "time_per_iteration": 2.6137661933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057745, + "balance_loss_mlp": 1.04847646, + "diversity_loss_mlp": 0.0, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.08075391789271535, + "language_loss": 0.78634858, + "learning_rate": 0.00022075950407939227, + "loss": 0.79692602, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3628, + "time_per_iteration": 2.6296188831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059141, + "balance_loss_mlp": 1.04959214, + "diversity_loss_mlp": 0.0, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.0897351301563825, + "language_loss": 0.8281461, + "learning_rate": 0.0002205011285599367, + "loss": 0.83873749, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3629, + "time_per_iteration": 2.6147000789642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079513, + "balance_loss_mlp": 1.34714937, + "diversity_loss_mlp": 0.21970588, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.029792453728032804, + "language_loss": 0.80962801, + "learning_rate": 0.00022024286155156658, + "loss": 0.81757927, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01170244, + "step": 3630, + "time_per_iteration": 2.8613815307617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058875, + "balance_loss_mlp": 1.04967785, + "diversity_loss_mlp": 0.0, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.10033041150535157, + "language_loss": 0.86079919, + "learning_rate": 0.00021998470315454994, + "loss": 0.87138796, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3631, + "time_per_iteration": 2.647185802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_mlp": 1.05195761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.06594571513985185, + "language_loss": 0.86829215, + "learning_rate": 0.00021972665346911275, + "loss": 0.87890601, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3632, + "time_per_iteration": 2.757704257965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065424, + "balance_loss_mlp": 1.05622673, + "diversity_loss_mlp": 0.0, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.06824207534465764, + "language_loss": 0.79957312, + "learning_rate": 0.00021946871259543877, + "loss": 0.81022739, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3633, + "time_per_iteration": 2.577909231185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063518, + "balance_loss_mlp": 1.05467892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.08329780404335202, + "language_loss": 0.83364546, + "learning_rate": 0.00021921088063366957, + "loss": 0.84428072, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3634, + "time_per_iteration": 2.933506965637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106547, + "balance_loss_mlp": 1.05625534, + "diversity_loss_mlp": 0.0, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.06097911291290099, + "language_loss": 0.81932688, + "learning_rate": 0.00021895315768390435, + "loss": 0.82998157, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3635, + "time_per_iteration": 2.6155378818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.06179357, + "diversity_loss_mlp": 0.0, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.05851098027896569, + "language_loss": 0.87547219, + "learning_rate": 0.00021869554384619999, + "loss": 0.88618374, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3636, + "time_per_iteration": 2.9845876693725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.05937409, + "diversity_loss_mlp": 0.0, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.066101183722826, + "language_loss": 0.80819213, + "learning_rate": 0.00021843803922057115, + "loss": 0.81887871, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3637, + "time_per_iteration": 2.736743688583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069376, + "balance_loss_mlp": 1.060215, + "diversity_loss_mlp": 0.0, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.07934438223674636, + "language_loss": 0.8197611, + "learning_rate": 0.00021818064390698977, + "loss": 0.83045483, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3638, + "time_per_iteration": 2.6075611114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_mlp": 1.06178594, + "diversity_loss_mlp": 0.0, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.0705113992952529, + "language_loss": 0.87237096, + "learning_rate": 0.0002179233580053861, + "loss": 0.88307768, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3639, + "time_per_iteration": 2.7142910957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107015, + "balance_loss_mlp": 1.06120896, + "diversity_loss_mlp": 0.0, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.07560028355572443, + "language_loss": 0.85636085, + "learning_rate": 0.00021766618161564688, + "loss": 0.86706233, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3640, + "time_per_iteration": 2.7285115718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065402, + "balance_loss_mlp": 1.0562886, + "diversity_loss_mlp": 0.0, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.06395770762467583, + "language_loss": 0.87343419, + "learning_rate": 0.00021740911483761677, + "loss": 0.88408822, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3641, + "time_per_iteration": 2.584667205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068314, + "balance_loss_mlp": 1.05936706, + "diversity_loss_mlp": 0.0, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.05940351360925286, + "language_loss": 0.91777283, + "learning_rate": 0.00021715215777109837, + "loss": 0.92845595, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 3642, + "time_per_iteration": 2.9933156967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.06025815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.07347565488383569, + "language_loss": 0.84518594, + "learning_rate": 0.00021689531051585103, + "loss": 0.85587853, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3643, + "time_per_iteration": 2.6531710624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067704, + "balance_loss_mlp": 1.05844164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.08696231717445767, + "language_loss": 0.80713868, + "learning_rate": 0.00021663857317159196, + "loss": 0.81781578, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3644, + "time_per_iteration": 2.604703426361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.06396961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.057193672258815845, + "language_loss": 0.81973934, + "learning_rate": 0.00021638194583799487, + "loss": 0.83046699, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3645, + "time_per_iteration": 2.6747145652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.05851054, + "diversity_loss_mlp": 0.0, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.08498226844175927, + "language_loss": 0.82551372, + "learning_rate": 0.00021612542861469176, + "loss": 0.83618826, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3646, + "time_per_iteration": 3.2375802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067002, + "balance_loss_mlp": 1.05810285, + "diversity_loss_mlp": 0.0, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.07003978186883456, + "language_loss": 0.8260622, + "learning_rate": 0.00021586902160127135, + "loss": 0.83673215, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3647, + "time_per_iteration": 2.6448206901550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076859, + "balance_loss_mlp": 1.06791854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.11788208419913924, + "language_loss": 0.74163634, + "learning_rate": 0.00021561272489727974, + "loss": 0.75240493, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3648, + "time_per_iteration": 2.5040485858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.07128358, + "diversity_loss_mlp": 0.0, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.06337788759133205, + "language_loss": 0.8008945, + "learning_rate": 0.0002153565386022199, + "loss": 0.81169432, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 3649, + "time_per_iteration": 2.7248024940490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076924, + "balance_loss_mlp": 1.06812, + "diversity_loss_mlp": 0.0, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0801860998557123, + "language_loss": 0.82855487, + "learning_rate": 0.00021510046281555262, + "loss": 0.83932412, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 3650, + "time_per_iteration": 2.809051036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077447, + "balance_loss_mlp": 1.06870925, + "diversity_loss_mlp": 0.0, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.08542793543919469, + "language_loss": 0.81736684, + "learning_rate": 0.0002148444976366949, + "loss": 0.82814133, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 3651, + "time_per_iteration": 2.7492573261260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_mlp": 1.07583714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.0799718694707253, + "language_loss": 0.82820916, + "learning_rate": 0.00021458864316502136, + "loss": 0.83905321, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 3652, + "time_per_iteration": 2.7140626907348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.07368028, + "diversity_loss_mlp": 0.0, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.0716785593922181, + "language_loss": 0.87417138, + "learning_rate": 0.0002143328994998634, + "loss": 0.88499534, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 3653, + "time_per_iteration": 2.5076870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074305, + "balance_loss_mlp": 1.06541252, + "diversity_loss_mlp": 0.0, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.078552736129926, + "language_loss": 0.78368807, + "learning_rate": 0.00021407726674050982, + "loss": 0.79443109, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3654, + "time_per_iteration": 2.8595826625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077013, + "balance_loss_mlp": 1.06806064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.06456326920806615, + "language_loss": 0.8704083, + "learning_rate": 0.0002138217449862061, + "loss": 0.88117838, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3655, + "time_per_iteration": 2.727473258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074047, + "balance_loss_mlp": 1.06530333, + "diversity_loss_mlp": 0.0, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.06685907167482581, + "language_loss": 0.78296137, + "learning_rate": 0.00021356633433615403, + "loss": 0.79370177, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3656, + "time_per_iteration": 2.5853357315063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072471, + "balance_loss_mlp": 1.06341755, + "diversity_loss_mlp": 0.0, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.05195711031116695, + "language_loss": 0.83568424, + "learning_rate": 0.0002133110348895133, + "loss": 0.84640896, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3657, + "time_per_iteration": 2.966989517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.06044364, + "diversity_loss_mlp": 0.0, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.05842315057280589, + "language_loss": 0.85166538, + "learning_rate": 0.0002130558467453999, + "loss": 0.86236197, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3658, + "time_per_iteration": 3.3303468227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080025, + "balance_loss_mlp": 1.07069683, + "diversity_loss_mlp": 0.0, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06729984707772495, + "language_loss": 0.8469972, + "learning_rate": 0.0002128007700028865, + "loss": 0.85779744, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3659, + "time_per_iteration": 2.7004916667938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_mlp": 1.06041121, + "diversity_loss_mlp": 0.0, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.08608403684795747, + "language_loss": 0.84587854, + "learning_rate": 0.00021254580476100276, + "loss": 0.85657346, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3660, + "time_per_iteration": 2.5480196475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072222, + "balance_loss_mlp": 1.06278646, + "diversity_loss_mlp": 0.0, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.07339918095130941, + "language_loss": 0.79315257, + "learning_rate": 0.00021229095111873497, + "loss": 0.80387473, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3661, + "time_per_iteration": 2.7757935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791151, + "balance_loss_mlp": 1.34026599, + "diversity_loss_mlp": 0.21938899, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.027590424390171175, + "language_loss": 0.85883224, + "learning_rate": 0.0002120362091750261, + "loss": 0.8667438, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01132388, + "step": 3662, + "time_per_iteration": 2.896202802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798199, + "balance_loss_mlp": 1.35343075, + "diversity_loss_mlp": 0.22044487, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.03684811642709949, + "language_loss": 0.87121612, + "learning_rate": 0.00021178157902877566, + "loss": 0.87919807, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01126087, + "step": 3663, + "time_per_iteration": 2.4897618293762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059718, + "balance_loss_mlp": 1.05026472, + "diversity_loss_mlp": 0.0, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.06585144557964606, + "language_loss": 0.868586, + "learning_rate": 0.0002115270607788397, + "loss": 0.87918323, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3664, + "time_per_iteration": 2.767237901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_mlp": 1.05233264, + "diversity_loss_mlp": 0.0, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.06809628156665722, + "language_loss": 0.8563199, + "learning_rate": 0.00021127265452403133, + "loss": 0.86693728, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3665, + "time_per_iteration": 2.5270590782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028622, + "balance_loss_mlp": 1.02266109, + "diversity_loss_mlp": 0.0, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.030216242564882093, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85120249, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3666, + "time_per_iteration": 4.850507974624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_mlp": 1.04785872, + "diversity_loss_mlp": 0.0, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.07688296901308685, + "language_loss": 0.82549417, + "learning_rate": 0.00021076417839483065, + "loss": 0.83607054, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3667, + "time_per_iteration": 2.789318799972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00785288, + "balance_loss_mlp": 1.32734215, + "diversity_loss_mlp": 0.21942863, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.027872662040783723, + "language_loss": 0.85229611, + "learning_rate": 0.00021051010871784589, + "loss": 0.86014903, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01190263, + "step": 3668, + "time_per_iteration": 2.6029293537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_mlp": 1.03972173, + "diversity_loss_mlp": 0.0, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.06094440535163373, + "language_loss": 0.79136097, + "learning_rate": 0.0002102561514308045, + "loss": 0.80185533, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3669, + "time_per_iteration": 2.717550754547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_mlp": 1.03882289, + "diversity_loss_mlp": 0.0, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.06685679205809081, + "language_loss": 0.82684934, + "learning_rate": 0.00021000230663230135, + "loss": 0.83733451, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3670, + "time_per_iteration": 2.663641929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_mlp": 1.03758621, + "diversity_loss_mlp": 0.0, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.0788999580683501, + "language_loss": 0.8333686, + "learning_rate": 0.00020974857442088762, + "loss": 0.84384131, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3671, + "time_per_iteration": 2.603200674057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_mlp": 1.04090595, + "diversity_loss_mlp": 0.0, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.06597055707746856, + "language_loss": 0.89200228, + "learning_rate": 0.00020949495489507104, + "loss": 0.90250599, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3672, + "time_per_iteration": 2.6877996921539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_mlp": 1.04270363, + "diversity_loss_mlp": 0.0, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.17274894008002345, + "language_loss": 0.84991109, + "learning_rate": 0.00020924144815331525, + "loss": 0.86043334, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3673, + "time_per_iteration": 2.5844242572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.04517114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.0640379080300773, + "language_loss": 0.83600396, + "learning_rate": 0.00020898805429404044, + "loss": 0.84655201, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3674, + "time_per_iteration": 2.676417350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_mlp": 1.04724169, + "diversity_loss_mlp": 0.0, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.0780577693768427, + "language_loss": 0.78793156, + "learning_rate": 0.0002087347734156228, + "loss": 0.79849994, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3675, + "time_per_iteration": 2.8697783946990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057522, + "balance_loss_mlp": 1.04800272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.0710988084964876, + "language_loss": 0.79834986, + "learning_rate": 0.00020848160561639452, + "loss": 0.80892509, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3676, + "time_per_iteration": 2.7413785457611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106069, + "balance_loss_mlp": 1.05147529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.06834186778178446, + "language_loss": 0.86040401, + "learning_rate": 0.0002082285509946445, + "loss": 0.8710109, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3677, + "time_per_iteration": 2.5471127033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063838, + "balance_loss_mlp": 1.05436051, + "diversity_loss_mlp": 0.0, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.06236421972787801, + "language_loss": 0.83409554, + "learning_rate": 0.00020797560964861683, + "loss": 0.84473389, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3678, + "time_per_iteration": 2.748696804046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.05635202, + "diversity_loss_mlp": 0.0, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.07878907365407993, + "language_loss": 0.80641901, + "learning_rate": 0.0002077227816765122, + "loss": 0.81707478, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3679, + "time_per_iteration": 3.000666618347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_mlp": 1.03114033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.025842314854182848, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77483988, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3680, + "time_per_iteration": 4.779016971588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106697, + "balance_loss_mlp": 1.05772507, + "diversity_loss_mlp": 0.0, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.06703239561102693, + "language_loss": 0.78754878, + "learning_rate": 0.00020721746624665383, + "loss": 0.79821849, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3681, + "time_per_iteration": 2.7041916847229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073317, + "balance_loss_mlp": 1.06381631, + "diversity_loss_mlp": 0.0, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.06071055961479113, + "language_loss": 0.80160034, + "learning_rate": 0.00020696497898508114, + "loss": 0.81233358, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3682, + "time_per_iteration": 3.003126382827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06374955, + "diversity_loss_mlp": 0.0, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.0794178936209596, + "language_loss": 0.77425051, + "learning_rate": 0.00020671260548979316, + "loss": 0.7849825, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3683, + "time_per_iteration": 3.000619649887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079652, + "balance_loss_mlp": 1.07019854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.06569012319146904, + "language_loss": 0.85012448, + "learning_rate": 0.00020646034585876982, + "loss": 0.86092097, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3684, + "time_per_iteration": 2.8407599925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788495, + "balance_loss_mlp": 1.33244729, + "diversity_loss_mlp": 0.22155851, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.02817752508262258, + "language_loss": 0.84630954, + "learning_rate": 0.00020620820018994718, + "loss": 0.8541944, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0114923, + "step": 3685, + "time_per_iteration": 2.8807289600372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791818, + "balance_loss_mlp": 1.33957911, + "diversity_loss_mlp": 0.22135019, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.03572846620936607, + "language_loss": 0.83307725, + "learning_rate": 0.00020595616858121675, + "loss": 0.84099543, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113536, + "step": 3686, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075035, + "balance_loss_mlp": 1.06569517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05825520117041851, + "language_loss": 0.80985916, + "learning_rate": 0.00020570425113042586, + "loss": 0.82060945, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3687, + "time_per_iteration": 2.724151611328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078198, + "balance_loss_mlp": 1.06894779, + "diversity_loss_mlp": 0.0, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.0736963808397267, + "language_loss": 0.8558749, + "learning_rate": 0.0002054524479353776, + "loss": 0.8666569, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3688, + "time_per_iteration": 2.7505970001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074288, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07506666957013575, + "language_loss": 0.81571054, + "learning_rate": 0.00020520075909383063, + "loss": 0.82645345, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3689, + "time_per_iteration": 2.854198694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.06511474, + "diversity_loss_mlp": 0.0, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.06551416788386397, + "language_loss": 0.80860078, + "learning_rate": 0.00020494918470349916, + "loss": 0.81934714, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3690, + "time_per_iteration": 3.2713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079528, + "balance_loss_mlp": 1.34716058, + "diversity_loss_mlp": 0.22097552, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.03587666052644611, + "language_loss": 0.85333264, + "learning_rate": 0.00020469772486205297, + "loss": 0.86128545, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01121199, + "step": 3691, + "time_per_iteration": 2.626685380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787595, + "balance_loss_mlp": 1.33183146, + "diversity_loss_mlp": 0.22060202, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.030476334667887343, + "language_loss": 0.81455922, + "learning_rate": 0.0002044463796671177, + "loss": 0.82243514, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113784, + "step": 3692, + "time_per_iteration": 2.7819416522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_mlp": 1.06465387, + "diversity_loss_mlp": 0.0, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.07963770038273417, + "language_loss": 0.8046093, + "learning_rate": 0.00020419514921627408, + "loss": 0.81534946, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3693, + "time_per_iteration": 2.8676981925964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069877, + "balance_loss_mlp": 1.06088233, + "diversity_loss_mlp": 0.0, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.07391756130926609, + "language_loss": 0.77261078, + "learning_rate": 0.00020394403360705855, + "loss": 0.78330958, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3694, + "time_per_iteration": 2.695068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788663, + "balance_loss_mlp": 1.33321095, + "diversity_loss_mlp": 0.22100018, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.034812211167962216, + "language_loss": 0.88271379, + "learning_rate": 0.00020369303293696228, + "loss": 0.89060044, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155703, + "step": 3695, + "time_per_iteration": 2.601621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066517, + "balance_loss_mlp": 1.05723643, + "diversity_loss_mlp": 0.0, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.07715335648803619, + "language_loss": 0.78224587, + "learning_rate": 0.00020344214730343304, + "loss": 0.79291105, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3696, + "time_per_iteration": 2.6193599700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_mlp": 1.05618572, + "diversity_loss_mlp": 0.0, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05468894944159508, + "language_loss": 0.79277122, + "learning_rate": 0.00020319137680387296, + "loss": 0.80342424, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3697, + "time_per_iteration": 2.9309933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060045, + "balance_loss_mlp": 1.05068743, + "diversity_loss_mlp": 0.0, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.07057759031394817, + "language_loss": 0.80451727, + "learning_rate": 0.0002029407215356398, + "loss": 0.81511772, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3698, + "time_per_iteration": 2.4956727027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058583, + "balance_loss_mlp": 1.04976714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.0722387573875999, + "language_loss": 0.83844793, + "learning_rate": 0.00020269018159604663, + "loss": 0.84903371, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3699, + "time_per_iteration": 2.731231689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057429, + "balance_loss_mlp": 1.04814827, + "diversity_loss_mlp": 0.0, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.07123396580800914, + "language_loss": 0.818003, + "learning_rate": 0.00020243975708236162, + "loss": 0.82857728, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3700, + "time_per_iteration": 2.597215414047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781944, + "balance_loss_mlp": 1.31673443, + "diversity_loss_mlp": 0.22274226, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.030217464674653638, + "language_loss": 0.86634398, + "learning_rate": 0.00020218944809180818, + "loss": 0.87416339, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220552, + "step": 3701, + "time_per_iteration": 2.7128944396972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056546, + "balance_loss_mlp": 1.04739642, + "diversity_loss_mlp": 0.0, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.06969302254489844, + "language_loss": 0.84630072, + "learning_rate": 0.00020193925472156493, + "loss": 0.85686618, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3702, + "time_per_iteration": 2.695040702819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009738, + "balance_loss_mlp": 1.00442076, + "diversity_loss_mlp": 0.0, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.015177951683804305, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75298905, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3703, + "time_per_iteration": 4.91239857673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784779, + "balance_loss_mlp": 1.3239193, + "diversity_loss_mlp": 0.22157452, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.02622509859947044, + "language_loss": 0.83696187, + "learning_rate": 0.00020143921523049863, + "loss": 0.84480959, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203172, + "step": 3704, + "time_per_iteration": 3.0262062549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057244, + "balance_loss_mlp": 1.04805851, + "diversity_loss_mlp": 0.0, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.07737525798134272, + "language_loss": 0.838422, + "learning_rate": 0.00020118936930380837, + "loss": 0.84899437, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3705, + "time_per_iteration": 2.741217851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105596, + "balance_loss_mlp": 1.04639971, + "diversity_loss_mlp": 0.0, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.08146435226617602, + "language_loss": 0.80879092, + "learning_rate": 0.0002009396393856932, + "loss": 0.81935048, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3706, + "time_per_iteration": 2.643540143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.0414114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.07418360122955521, + "language_loss": 0.82790005, + "learning_rate": 0.00020069002557310673, + "loss": 0.83840382, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3707, + "time_per_iteration": 2.719648838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052452, + "balance_loss_mlp": 1.04351699, + "diversity_loss_mlp": 0.0, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.05884856391484217, + "language_loss": 0.77115107, + "learning_rate": 0.00020044052796295807, + "loss": 0.78167558, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3708, + "time_per_iteration": 2.830353260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04202533, + "diversity_loss_mlp": 0.0, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.07889939453961878, + "language_loss": 0.82217181, + "learning_rate": 0.00020019114665211063, + "loss": 0.83268428, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3709, + "time_per_iteration": 2.581709623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048706, + "balance_loss_mlp": 1.03982449, + "diversity_loss_mlp": 0.0, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.06519405348344502, + "language_loss": 0.81405282, + "learning_rate": 0.00019994188173738276, + "loss": 0.8245399, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3710, + "time_per_iteration": 2.5735976696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049854, + "balance_loss_mlp": 1.04063272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.07046885330875076, + "language_loss": 0.80712581, + "learning_rate": 0.0001996927333155477, + "loss": 0.81762433, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3711, + "time_per_iteration": 2.814368724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_mlp": 1.04546654, + "diversity_loss_mlp": 0.0, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.07187972004168419, + "language_loss": 0.85349059, + "learning_rate": 0.00019944370148333346, + "loss": 0.8640365, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3712, + "time_per_iteration": 3.169759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058298, + "balance_loss_mlp": 1.04938745, + "diversity_loss_mlp": 0.0, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.060002667598624965, + "language_loss": 0.79623508, + "learning_rate": 0.00019919478633742278, + "loss": 0.80681807, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3713, + "time_per_iteration": 2.644663095474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061749, + "balance_loss_mlp": 1.05258763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.07397385813864758, + "language_loss": 0.85182703, + "learning_rate": 0.00019894598797445302, + "loss": 0.86244452, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3714, + "time_per_iteration": 2.5240604877471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061709, + "balance_loss_mlp": 1.05239308, + "diversity_loss_mlp": 0.0, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.07339492646897193, + "language_loss": 0.81885231, + "learning_rate": 0.00019869730649101615, + "loss": 0.82946944, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3715, + "time_per_iteration": 2.827868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063135, + "balance_loss_mlp": 1.05403948, + "diversity_loss_mlp": 0.0, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.0742719443850205, + "language_loss": 0.72613627, + "learning_rate": 0.00019844874198365943, + "loss": 0.73676765, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3716, + "time_per_iteration": 3.0963878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.05428362, + "diversity_loss_mlp": 0.0, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061591749317610134, + "language_loss": 0.83976817, + "learning_rate": 0.00019820029454888362, + "loss": 0.85040331, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3717, + "time_per_iteration": 2.7068889141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006732, + "balance_loss_mlp": 1.0012722, + "diversity_loss_mlp": 0.0, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.016486733546314403, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75528002, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 3718, + "time_per_iteration": 5.0301513671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010681, + "balance_loss_mlp": 1.05873013, + "diversity_loss_mlp": 0.0, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06632920905024949, + "language_loss": 0.80107152, + "learning_rate": 0.0001977037512828529, + "loss": 0.81175244, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3719, + "time_per_iteration": 2.573982000350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066134, + "balance_loss_mlp": 1.05686522, + "diversity_loss_mlp": 0.0, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.05986593090344285, + "language_loss": 0.86432415, + "learning_rate": 0.0001974556556443734, + "loss": 0.87498546, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3720, + "time_per_iteration": 2.7087209224700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106825, + "balance_loss_mlp": 1.0589757, + "diversity_loss_mlp": 0.0, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05551674827732864, + "language_loss": 0.88590324, + "learning_rate": 0.00019720767746402547, + "loss": 0.89658576, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3721, + "time_per_iteration": 2.7290821075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010754, + "balance_loss_mlp": 1.06610191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.07406216566818759, + "language_loss": 0.79965603, + "learning_rate": 0.00019695981683808222, + "loss": 0.81041002, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3722, + "time_per_iteration": 2.8323793411254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072903, + "balance_loss_mlp": 1.06386733, + "diversity_loss_mlp": 0.0, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.08922707402242334, + "language_loss": 0.84955275, + "learning_rate": 0.00019671207386277225, + "loss": 0.86028177, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3723, + "time_per_iteration": 2.94681978225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069475, + "balance_loss_mlp": 1.06010544, + "diversity_loss_mlp": 0.0, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.07420263460977167, + "language_loss": 0.78355432, + "learning_rate": 0.0001964644486342777, + "loss": 0.79424912, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3724, + "time_per_iteration": 2.960944414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064733, + "balance_loss_mlp": 1.05573297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.0760825236490028, + "language_loss": 0.86588323, + "learning_rate": 0.00019621694124873524, + "loss": 0.87653053, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3725, + "time_per_iteration": 2.6881937980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101766, + "balance_loss_mlp": 1.01224804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.018433056607108506, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77557743, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3726, + "time_per_iteration": 4.8842387199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_mlp": 1.04820442, + "diversity_loss_mlp": 0.0, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.08148717312552407, + "language_loss": 0.77167314, + "learning_rate": 0.00019572228039082428, + "loss": 0.78224969, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3727, + "time_per_iteration": 3.071643829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055911, + "balance_loss_mlp": 1.04670763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05270267691232831, + "language_loss": 0.83482945, + "learning_rate": 0.0001954751271105002, + "loss": 0.84538865, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3728, + "time_per_iteration": 2.8301711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_mlp": 1.04496169, + "diversity_loss_mlp": 0.0, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.06896440922655821, + "language_loss": 0.80838037, + "learning_rate": 0.00019522809205721687, + "loss": 0.81892335, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3729, + "time_per_iteration": 2.8094747066497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048553, + "balance_loss_mlp": 1.03930831, + "diversity_loss_mlp": 0.0, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.09744205035272979, + "language_loss": 0.83110106, + "learning_rate": 0.0001949811753268816, + "loss": 0.84158659, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3730, + "time_per_iteration": 2.6963374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045755, + "balance_loss_mlp": 1.03643274, + "diversity_loss_mlp": 0.0, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.0730125544637403, + "language_loss": 0.82630277, + "learning_rate": 0.00019473437701535634, + "loss": 0.83676028, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3731, + "time_per_iteration": 2.6076574325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_mlp": 1.03844213, + "diversity_loss_mlp": 0.0, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.07914181118847867, + "language_loss": 0.89615285, + "learning_rate": 0.00019448769721845677, + "loss": 0.90662855, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3732, + "time_per_iteration": 2.824897289276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_mlp": 1.03853655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.07061643018013358, + "language_loss": 0.86148334, + "learning_rate": 0.00019424113603195203, + "loss": 0.87196326, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3733, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_mlp": 1.03879809, + "diversity_loss_mlp": 0.0, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.07087799527916698, + "language_loss": 0.79863775, + "learning_rate": 0.0001939946935515657, + "loss": 0.80912238, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3734, + "time_per_iteration": 2.8286993503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_mlp": 1.03927684, + "diversity_loss_mlp": 0.0, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.08245280249652003, + "language_loss": 0.80650169, + "learning_rate": 0.0001937483698729755, + "loss": 0.8169921, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3735, + "time_per_iteration": 2.6458795070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_mlp": 1.0338974, + "diversity_loss_mlp": 0.0, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.07515481344769812, + "language_loss": 0.82211673, + "learning_rate": 0.0001935021650918128, + "loss": 0.83255374, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3736, + "time_per_iteration": 3.0285887718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_mlp": 1.03346682, + "diversity_loss_mlp": 0.0, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06979349456564556, + "language_loss": 0.87017608, + "learning_rate": 0.0001932560793036625, + "loss": 0.88060999, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3737, + "time_per_iteration": 2.482374906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_mlp": 1.03452408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.08340257337042449, + "language_loss": 0.86882925, + "learning_rate": 0.00019301011260406382, + "loss": 0.87927186, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3738, + "time_per_iteration": 2.6162045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_mlp": 1.03576994, + "diversity_loss_mlp": 0.0, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.0721539169034284, + "language_loss": 0.79805303, + "learning_rate": 0.00019276426508850936, + "loss": 0.80850697, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3739, + "time_per_iteration": 2.7380456924438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_mlp": 1.03111315, + "diversity_loss_mlp": 0.0, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.0788007665709812, + "language_loss": 0.80469853, + "learning_rate": 0.00019251853685244564, + "loss": 0.81510872, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3740, + "time_per_iteration": 3.0559754371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_mlp": 1.03485012, + "diversity_loss_mlp": 0.0, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.07989753754857366, + "language_loss": 0.80738026, + "learning_rate": 0.00019227292799127283, + "loss": 0.81782538, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3741, + "time_per_iteration": 3.0058369636535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_mlp": 1.03530192, + "diversity_loss_mlp": 0.0, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.17846470971826942, + "language_loss": 0.79000109, + "learning_rate": 0.00019202743860034454, + "loss": 0.80044937, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3742, + "time_per_iteration": 3.218614339828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_mlp": 1.03441513, + "diversity_loss_mlp": 0.0, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.07729553507192725, + "language_loss": 0.83831203, + "learning_rate": 0.00019178206877496873, + "loss": 0.84874886, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3743, + "time_per_iteration": 2.7014403343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048278, + "balance_loss_mlp": 1.03885424, + "diversity_loss_mlp": 0.0, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.06342209640567653, + "language_loss": 0.85333169, + "learning_rate": 0.0001915368186104059, + "loss": 0.86381447, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3744, + "time_per_iteration": 2.733520746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105441, + "balance_loss_mlp": 1.04513526, + "diversity_loss_mlp": 0.0, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.08207076889899251, + "language_loss": 0.81176144, + "learning_rate": 0.0001912916882018706, + "loss": 0.8223055, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3745, + "time_per_iteration": 2.7833125591278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057126, + "balance_loss_mlp": 1.04774427, + "diversity_loss_mlp": 0.0, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.08263651010752651, + "language_loss": 0.79468751, + "learning_rate": 0.00019104667764453125, + "loss": 0.80525875, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3746, + "time_per_iteration": 3.0572047233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066676, + "balance_loss_mlp": 1.05751503, + "diversity_loss_mlp": 0.0, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.06554660744507769, + "language_loss": 0.80441052, + "learning_rate": 0.00019080178703350926, + "loss": 0.8150773, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3747, + "time_per_iteration": 2.6344070434570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_mlp": 1.05819249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.07164749029527417, + "language_loss": 0.83225226, + "learning_rate": 0.00019055701646387952, + "loss": 0.84292996, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3748, + "time_per_iteration": 2.674436330795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014621, + "balance_loss_mlp": 1.00935245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.01350364958452467, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.8148731, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3749, + "time_per_iteration": 4.8169167041778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074721, + "balance_loss_mlp": 1.06568444, + "diversity_loss_mlp": 0.0, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.09948968640859872, + "language_loss": 0.86443639, + "learning_rate": 0.00019006783582886368, + "loss": 0.87518358, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3750, + "time_per_iteration": 2.6094882488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082564, + "balance_loss_mlp": 1.0731287, + "diversity_loss_mlp": 0.0, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.0940617497046545, + "language_loss": 0.8313877, + "learning_rate": 0.00018982342595339437, + "loss": 0.84221339, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3751, + "time_per_iteration": 4.834062576293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077441, + "balance_loss_mlp": 1.06848848, + "diversity_loss_mlp": 0.0, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.08300933032368943, + "language_loss": 0.81837034, + "learning_rate": 0.00018957913649915076, + "loss": 0.82914484, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3752, + "time_per_iteration": 3.1204826831817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076559, + "balance_loss_mlp": 1.06739748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.08305681898579634, + "language_loss": 0.79633486, + "learning_rate": 0.00018933496756097428, + "loss": 0.80710053, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3753, + "time_per_iteration": 2.6664350032806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077149, + "balance_loss_mlp": 1.06786871, + "diversity_loss_mlp": 0.0, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.08328010196337048, + "language_loss": 0.81679463, + "learning_rate": 0.0001890909192336603, + "loss": 0.82756615, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3754, + "time_per_iteration": 2.994882822036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_mlp": 1.06407857, + "diversity_loss_mlp": 0.0, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.08777822688547723, + "language_loss": 0.70716894, + "learning_rate": 0.00018884699161195623, + "loss": 0.71790028, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3755, + "time_per_iteration": 4.262615442276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_mlp": 1.06174874, + "diversity_loss_mlp": 0.0, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.0673256778775424, + "language_loss": 0.77517748, + "learning_rate": 0.00018860318479056327, + "loss": 0.78588951, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3756, + "time_per_iteration": 3.1185147762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.05514276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.06734169026400741, + "language_loss": 0.83406973, + "learning_rate": 0.00018835949886413555, + "loss": 0.84471071, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3757, + "time_per_iteration": 2.7693490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.05735517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.0750419048722912, + "language_loss": 0.78459024, + "learning_rate": 0.0001881159339272806, + "loss": 0.79525727, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3758, + "time_per_iteration": 2.6415517330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_mlp": 1.05062032, + "diversity_loss_mlp": 0.0, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.0644798827635335, + "language_loss": 0.78601432, + "learning_rate": 0.00018787249007455858, + "loss": 0.79661226, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3759, + "time_per_iteration": 2.6022799015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_mlp": 1.05413401, + "diversity_loss_mlp": 0.0, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.07015599197769962, + "language_loss": 0.71291095, + "learning_rate": 0.00018762916740048302, + "loss": 0.72354335, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3760, + "time_per_iteration": 2.8239991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059773, + "balance_loss_mlp": 1.05033171, + "diversity_loss_mlp": 0.0, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.07068719643677601, + "language_loss": 0.86275655, + "learning_rate": 0.0001873859659995195, + "loss": 0.87335426, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3761, + "time_per_iteration": 2.825853109359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056903, + "balance_loss_mlp": 1.04742599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.06521234046982781, + "language_loss": 0.83369851, + "learning_rate": 0.0001871428859660878, + "loss": 0.84426749, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3762, + "time_per_iteration": 2.765061855316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054846, + "balance_loss_mlp": 1.04584002, + "diversity_loss_mlp": 0.0, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.06876344834189922, + "language_loss": 0.81910485, + "learning_rate": 0.00018689992739455975, + "loss": 0.82965332, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3763, + "time_per_iteration": 2.955744504928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_mlp": 1.04123139, + "diversity_loss_mlp": 0.0, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.06967924844938471, + "language_loss": 0.85903621, + "learning_rate": 0.00018665709037926027, + "loss": 0.86954343, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3764, + "time_per_iteration": 3.306689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_mlp": 1.04077554, + "diversity_loss_mlp": 0.0, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.07823184864923875, + "language_loss": 0.8509047, + "learning_rate": 0.00018641437501446694, + "loss": 0.86140537, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3765, + "time_per_iteration": 2.5606436729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.04385924, + "diversity_loss_mlp": 0.0, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.07453327039799393, + "language_loss": 0.8240428, + "learning_rate": 0.0001861717813944104, + "loss": 0.83457536, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3766, + "time_per_iteration": 2.639479875564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052028, + "balance_loss_mlp": 1.04260468, + "diversity_loss_mlp": 0.0, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.07462880824505752, + "language_loss": 0.79635704, + "learning_rate": 0.00018592930961327365, + "loss": 0.80687737, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3767, + "time_per_iteration": 2.71537446975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051032, + "balance_loss_mlp": 1.04159653, + "diversity_loss_mlp": 0.0, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.06502387009338012, + "language_loss": 0.88172042, + "learning_rate": 0.00018568695976519273, + "loss": 0.89223075, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3768, + "time_per_iteration": 2.7851336002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053572, + "balance_loss_mlp": 1.04388046, + "diversity_loss_mlp": 0.0, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.07526480217284313, + "language_loss": 0.80197144, + "learning_rate": 0.00018544473194425593, + "loss": 0.81250715, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3769, + "time_per_iteration": 2.5187532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054045, + "balance_loss_mlp": 1.044276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.07238275679239237, + "language_loss": 0.78824592, + "learning_rate": 0.00018520262624450485, + "loss": 0.79878634, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3770, + "time_per_iteration": 2.8748114109039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_mlp": 1.04787064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.08918095477851212, + "language_loss": 0.86894727, + "learning_rate": 0.00018496064275993324, + "loss": 0.87952113, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3771, + "time_per_iteration": 2.824845314025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105481, + "balance_loss_mlp": 1.04509437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06900224223805673, + "language_loss": 0.82001221, + "learning_rate": 0.00018471878158448686, + "loss": 0.83056033, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3772, + "time_per_iteration": 2.9548990726470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056235, + "balance_loss_mlp": 1.04668033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.058256019250052936, + "language_loss": 0.84301949, + "learning_rate": 0.00018447704281206512, + "loss": 0.85358179, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3773, + "time_per_iteration": 2.83591365814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055601, + "balance_loss_mlp": 1.04598725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.07576068763334884, + "language_loss": 0.82763028, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818638, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3774, + "time_per_iteration": 2.68778657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060495, + "balance_loss_mlp": 1.05112517, + "diversity_loss_mlp": 0.0, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.0805275617178238, + "language_loss": 0.80610001, + "learning_rate": 0.0001839939328516526, + "loss": 0.81670493, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3775, + "time_per_iteration": 2.7422258853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790959, + "balance_loss_mlp": 1.33957541, + "diversity_loss_mlp": 0.21958014, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.033705672182060005, + "language_loss": 0.8138454, + "learning_rate": 0.0001837525618512218, + "loss": 0.82175499, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138153, + "step": 3776, + "time_per_iteration": 2.9108829498291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04409015, + "diversity_loss_mlp": 0.0, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.07511121424148261, + "language_loss": 0.8321476, + "learning_rate": 0.00018351131362893519, + "loss": 0.84268057, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3777, + "time_per_iteration": 2.789809465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058309, + "balance_loss_mlp": 1.04874849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.08246656435114352, + "language_loss": 0.80534494, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592798, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3778, + "time_per_iteration": 2.6201207637786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_mlp": 1.0502367, + "diversity_loss_mlp": 0.0, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.060849425034284504, + "language_loss": 0.87504601, + "learning_rate": 0.00018302918589339036, + "loss": 0.88564098, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3779, + "time_per_iteration": 2.689378499984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061153, + "balance_loss_mlp": 1.05198562, + "diversity_loss_mlp": 0.0, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06743911417724738, + "language_loss": 0.90138805, + "learning_rate": 0.00018278830656731054, + "loss": 0.91199952, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3780, + "time_per_iteration": 2.6595706939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056667, + "balance_loss_mlp": 1.04758894, + "diversity_loss_mlp": 0.0, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.06124301945992682, + "language_loss": 0.86350238, + "learning_rate": 0.00018254755039373222, + "loss": 0.87406909, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3781, + "time_per_iteration": 2.7230565547943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.0530144, + "diversity_loss_mlp": 0.0, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.07105415138975459, + "language_loss": 0.83752382, + "learning_rate": 0.0001823069174661252, + "loss": 0.84814572, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3782, + "time_per_iteration": 2.7941086292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056585, + "balance_loss_mlp": 1.04759097, + "diversity_loss_mlp": 0.0, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.06458866746308467, + "language_loss": 0.78171599, + "learning_rate": 0.00018206640787791112, + "loss": 0.79228187, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3783, + "time_per_iteration": 2.618022918701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062955, + "balance_loss_mlp": 1.05387712, + "diversity_loss_mlp": 0.0, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.06663972838638854, + "language_loss": 0.85480422, + "learning_rate": 0.00018182602172246416, + "loss": 0.86543375, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3784, + "time_per_iteration": 2.6113829612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.05812776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.07678107880467737, + "language_loss": 0.76375031, + "learning_rate": 0.00018158575909311075, + "loss": 0.77441949, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3785, + "time_per_iteration": 2.650192975997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_mlp": 1.05243719, + "diversity_loss_mlp": 0.0, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.07604258502871962, + "language_loss": 0.79732937, + "learning_rate": 0.000181345620083129, + "loss": 0.80794436, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3786, + "time_per_iteration": 2.8074841499328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061193, + "balance_loss_mlp": 1.05211556, + "diversity_loss_mlp": 0.0, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.0629164713746694, + "language_loss": 0.86736983, + "learning_rate": 0.00018110560478574927, + "loss": 0.87798178, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3787, + "time_per_iteration": 2.6831634044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.05288577, + "diversity_loss_mlp": 0.0, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.07652228362928638, + "language_loss": 0.80521822, + "learning_rate": 0.0001808657132941533, + "loss": 0.81583983, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3788, + "time_per_iteration": 2.7681210041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063738, + "balance_loss_mlp": 1.05462408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06755228065084157, + "language_loss": 0.83012414, + "learning_rate": 0.00018062594570147572, + "loss": 0.84076142, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3789, + "time_per_iteration": 2.59897780418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069496, + "balance_loss_mlp": 1.06051326, + "diversity_loss_mlp": 0.0, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.0602370632110868, + "language_loss": 0.84944886, + "learning_rate": 0.00018038630210080243, + "loss": 0.86014384, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3790, + "time_per_iteration": 2.8492085933685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_mlp": 1.05299687, + "diversity_loss_mlp": 0.0, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.06258751029355039, + "language_loss": 0.85112703, + "learning_rate": 0.0001801467825851712, + "loss": 0.86174691, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3791, + "time_per_iteration": 2.724008321762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063231, + "balance_loss_mlp": 1.05412316, + "diversity_loss_mlp": 0.0, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.06759881980366181, + "language_loss": 0.78407717, + "learning_rate": 0.00017990738724757172, + "loss": 0.79470944, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3792, + "time_per_iteration": 2.8527557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.05635726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05706424828537789, + "language_loss": 0.82412189, + "learning_rate": 0.00017966811618094598, + "loss": 0.83477581, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3793, + "time_per_iteration": 2.891587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071379, + "balance_loss_mlp": 1.06256318, + "diversity_loss_mlp": 0.0, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.0800044571001495, + "language_loss": 0.84934509, + "learning_rate": 0.00017942896947818664, + "loss": 0.86005884, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3794, + "time_per_iteration": 2.578213691711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_mlp": 1.02208936, + "diversity_loss_mlp": 0.0, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.018812365315957286, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7585234, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.05200195, + "routerloss_mlp": 0.0, + "step": 3795, + "time_per_iteration": 4.8731958866119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05696881, + "diversity_loss_mlp": 0.0, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.08247331408198653, + "language_loss": 0.85473979, + "learning_rate": 0.00017895104953559947, + "loss": 0.86539787, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3796, + "time_per_iteration": 2.6150035858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071519, + "balance_loss_mlp": 1.06257856, + "diversity_loss_mlp": 0.0, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.0876682306683089, + "language_loss": 0.90019357, + "learning_rate": 0.00017871227648131672, + "loss": 0.91090876, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3797, + "time_per_iteration": 2.5456666946411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790219, + "balance_loss_mlp": 1.33552265, + "diversity_loss_mlp": 0.2213349, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.0295011086457174, + "language_loss": 0.82969385, + "learning_rate": 0.0001784736281619907, + "loss": 0.83759606, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01178985, + "step": 3798, + "time_per_iteration": 2.617690086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064248, + "balance_loss_mlp": 1.05507529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.0761333988969544, + "language_loss": 0.74143457, + "learning_rate": 0.00017823510467027232, + "loss": 0.75207704, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3799, + "time_per_iteration": 2.74944806098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061269, + "balance_loss_mlp": 1.05231094, + "diversity_loss_mlp": 0.0, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.07529945885516458, + "language_loss": 0.7849319, + "learning_rate": 0.00017799670609876516, + "loss": 0.79554456, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3800, + "time_per_iteration": 2.514719247817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106109, + "balance_loss_mlp": 1.05228066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.07202410794231434, + "language_loss": 0.89223945, + "learning_rate": 0.00017775843254002366, + "loss": 0.90285027, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3801, + "time_per_iteration": 2.742403507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059589, + "balance_loss_mlp": 1.05084491, + "diversity_loss_mlp": 0.0, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.060424645606399964, + "language_loss": 0.83728462, + "learning_rate": 0.00017752028408655367, + "loss": 0.84788048, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3802, + "time_per_iteration": 3.0845768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.007903, + "balance_loss_mlp": 1.33712423, + "diversity_loss_mlp": 0.22043222, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.03351149815402085, + "language_loss": 0.85395515, + "learning_rate": 0.00017728226083081272, + "loss": 0.86185813, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01152179, + "step": 3803, + "time_per_iteration": 2.625450849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.05536509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06980647435682294, + "language_loss": 0.81371546, + "learning_rate": 0.00017704436286520965, + "loss": 0.82435715, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3804, + "time_per_iteration": 2.5445075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064875, + "balance_loss_mlp": 1.05574334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.0710476755005787, + "language_loss": 0.84313726, + "learning_rate": 0.0001768065902821046, + "loss": 0.85378599, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3805, + "time_per_iteration": 2.6542673110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060899, + "balance_loss_mlp": 1.05200648, + "diversity_loss_mlp": 0.0, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.07797130890244271, + "language_loss": 0.8206104, + "learning_rate": 0.00017656894317380907, + "loss": 0.83121943, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3806, + "time_per_iteration": 2.701544761657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020369, + "balance_loss_mlp": 1.01498067, + "diversity_loss_mlp": 0.0, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.021367923460696967, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.7705164, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 3807, + "time_per_iteration": 5.001535177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066451, + "balance_loss_mlp": 1.05737972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.08165775614059534, + "language_loss": 0.83709639, + "learning_rate": 0.00017609402575064875, + "loss": 0.84776092, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 3808, + "time_per_iteration": 2.583564043045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061217, + "balance_loss_mlp": 1.05229425, + "diversity_loss_mlp": 0.0, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.0811056502064105, + "language_loss": 0.80930746, + "learning_rate": 0.00017585675562016367, + "loss": 0.81991959, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 3809, + "time_per_iteration": 2.6347053050994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101136, + "balance_loss_mlp": 1.00604343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.015405005389362274, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224206, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3810, + "time_per_iteration": 4.809669017791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.05418134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.08174261034044085, + "language_loss": 0.85100114, + "learning_rate": 0.00017538259298196474, + "loss": 0.86163306, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 3811, + "time_per_iteration": 2.5669541358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_mlp": 1.05802464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06518192792765873, + "language_loss": 0.82332867, + "learning_rate": 0.00017514570065833745, + "loss": 0.83399785, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3812, + "time_per_iteration": 2.7447328567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071687, + "balance_loss_mlp": 1.06259131, + "diversity_loss_mlp": 0.0, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09580264059121266, + "language_loss": 0.80788046, + "learning_rate": 0.00017490893445433426, + "loss": 0.81859732, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3813, + "time_per_iteration": 2.6378085613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.05522716, + "diversity_loss_mlp": 0.0, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.07102449829418327, + "language_loss": 0.81571025, + "learning_rate": 0.00017467229446187587, + "loss": 0.82635403, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3814, + "time_per_iteration": 2.7120914459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072677, + "balance_loss_mlp": 1.06393909, + "diversity_loss_mlp": 0.0, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.07114012207935533, + "language_loss": 0.81285048, + "learning_rate": 0.00017443578077283424, + "loss": 0.82357717, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 3815, + "time_per_iteration": 2.6395435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_mlp": 1.05747199, + "diversity_loss_mlp": 0.0, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.07483834875110257, + "language_loss": 0.84961641, + "learning_rate": 0.0001741993934790319, + "loss": 0.86028135, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3816, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.05116272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07480496039033006, + "language_loss": 0.84648383, + "learning_rate": 0.00017396313267224273, + "loss": 0.85708326, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 3817, + "time_per_iteration": 2.8066418170928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066831, + "balance_loss_mlp": 1.05799198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.0889487029403391, + "language_loss": 0.8847158, + "learning_rate": 0.0001737269984441912, + "loss": 0.89538407, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3818, + "time_per_iteration": 2.6318438053131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060197, + "balance_loss_mlp": 1.05124998, + "diversity_loss_mlp": 0.0, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.07556044268941689, + "language_loss": 0.85168499, + "learning_rate": 0.00017349099088655263, + "loss": 0.86228693, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3819, + "time_per_iteration": 2.6988065242767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058709, + "balance_loss_mlp": 1.05007255, + "diversity_loss_mlp": 0.0, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.06839680418094873, + "language_loss": 0.80908042, + "learning_rate": 0.00017325511009095375, + "loss": 0.81966752, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 3820, + "time_per_iteration": 2.727027177810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_mlp": 1.04837942, + "diversity_loss_mlp": 0.0, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.07744320065165705, + "language_loss": 0.83646286, + "learning_rate": 0.00017301935614897113, + "loss": 0.84703583, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3821, + "time_per_iteration": 2.6904449462890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059614, + "balance_loss_mlp": 1.05071497, + "diversity_loss_mlp": 0.0, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.06367960554180149, + "language_loss": 0.82050133, + "learning_rate": 0.00017278372915213274, + "loss": 0.83109748, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3822, + "time_per_iteration": 2.715162515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009526, + "balance_loss_mlp": 1.00437641, + "diversity_loss_mlp": 0.0, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.013680325571624621, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80903369, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3823, + "time_per_iteration": 4.962257146835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056616, + "balance_loss_mlp": 1.04753208, + "diversity_loss_mlp": 0.0, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.08246165896918017, + "language_loss": 0.80686677, + "learning_rate": 0.00017231285635975314, + "loss": 0.81743288, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3824, + "time_per_iteration": 2.892613172531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060803, + "balance_loss_mlp": 1.05131412, + "diversity_loss_mlp": 0.0, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.06805025721620432, + "language_loss": 0.83387762, + "learning_rate": 0.00017207761074702115, + "loss": 0.84448564, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3825, + "time_per_iteration": 2.600008964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061591, + "balance_loss_mlp": 1.05259085, + "diversity_loss_mlp": 0.0, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.06050130894095604, + "language_loss": 0.84002912, + "learning_rate": 0.0001718424924450514, + "loss": 0.85064507, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3826, + "time_per_iteration": 2.5992300510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054783, + "balance_loss_mlp": 1.04562807, + "diversity_loss_mlp": 0.0, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.057066515344493245, + "language_loss": 0.86262274, + "learning_rate": 0.00017160750154512482, + "loss": 0.87317061, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3827, + "time_per_iteration": 2.726304292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795034, + "balance_loss_mlp": 1.34579134, + "diversity_loss_mlp": 0.220893, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.03015959834370855, + "language_loss": 0.83901906, + "learning_rate": 0.0001713726381384731, + "loss": 0.84696937, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169185, + "step": 3828, + "time_per_iteration": 2.8043603897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061645, + "balance_loss_mlp": 1.05248344, + "diversity_loss_mlp": 0.0, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06844777280948466, + "language_loss": 0.81076348, + "learning_rate": 0.00017113790231627812, + "loss": 0.8213799, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3829, + "time_per_iteration": 2.619093179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100728, + "balance_loss_mlp": 1.0020107, + "diversity_loss_mlp": 0.0, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.01400462839453399, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80265498, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3830, + "time_per_iteration": 4.812221527099609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792371, + "balance_loss_mlp": 1.34191561, + "diversity_loss_mlp": 0.21972378, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.03330075510268521, + "language_loss": 0.81812584, + "learning_rate": 0.00017066881378973936, + "loss": 0.82604957, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155161, + "step": 3831, + "time_per_iteration": 2.7056965827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060657, + "balance_loss_mlp": 1.05176377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.07192956817041389, + "language_loss": 0.83134949, + "learning_rate": 0.00017043446126751189, + "loss": 0.84195602, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3832, + "time_per_iteration": 2.676421880722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060842, + "balance_loss_mlp": 1.05175185, + "diversity_loss_mlp": 0.0, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.07065913186643534, + "language_loss": 0.76922351, + "learning_rate": 0.00017020023669397376, + "loss": 0.77983195, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3833, + "time_per_iteration": 2.67942214012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063044, + "balance_loss_mlp": 1.0536567, + "diversity_loss_mlp": 0.0, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.07582868630536281, + "language_loss": 0.81676751, + "learning_rate": 0.0001699661401600589, + "loss": 0.82739794, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3834, + "time_per_iteration": 2.5813028812408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791828, + "balance_loss_mlp": 1.34016216, + "diversity_loss_mlp": 0.22067872, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.03104422851251126, + "language_loss": 0.78392982, + "learning_rate": 0.00016973217175665205, + "loss": 0.79184818, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01140742, + "step": 3835, + "time_per_iteration": 2.622943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_mlp": 0.99702322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.013207371532760371, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82168412, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3836, + "time_per_iteration": 4.931336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_mlp": 1.05126452, + "diversity_loss_mlp": 0.0, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.06649751574670516, + "language_loss": 0.84498501, + "learning_rate": 0.00016926461970465047, + "loss": 0.85558796, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 3837, + "time_per_iteration": 2.765747547149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_mlp": 1.04992294, + "diversity_loss_mlp": 0.0, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.0574260047104924, + "language_loss": 0.84358233, + "learning_rate": 0.00016903103623757516, + "loss": 0.85417342, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3838, + "time_per_iteration": 3.069658041000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060146, + "balance_loss_mlp": 1.05106258, + "diversity_loss_mlp": 0.0, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.19052913382225448, + "language_loss": 0.80133057, + "learning_rate": 0.00016879758126404738, + "loss": 0.81193197, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3839, + "time_per_iteration": 2.689941167831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789085, + "balance_loss_mlp": 1.33350182, + "diversity_loss_mlp": 0.2223025, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.03551016649676842, + "language_loss": 0.79851139, + "learning_rate": 0.00016856425487470216, + "loss": 0.80640227, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01118332, + "step": 3840, + "time_per_iteration": 3.1254615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064553, + "balance_loss_mlp": 1.05543303, + "diversity_loss_mlp": 0.0, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.0706997471436485, + "language_loss": 0.79199183, + "learning_rate": 0.00016833105716012486, + "loss": 0.8026374, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3841, + "time_per_iteration": 3.138193368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063082, + "balance_loss_mlp": 1.05398655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.06630465632536123, + "language_loss": 0.85135829, + "learning_rate": 0.00016809798821085088, + "loss": 0.86198914, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3842, + "time_per_iteration": 3.0023772716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.06117415, + "diversity_loss_mlp": 0.0, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.05652902477854722, + "language_loss": 0.89046443, + "learning_rate": 0.00016786504811736565, + "loss": 0.90116704, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3843, + "time_per_iteration": 2.706385374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063666, + "balance_loss_mlp": 1.05483222, + "diversity_loss_mlp": 0.0, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.0599118075718357, + "language_loss": 0.82577473, + "learning_rate": 0.00016763223697010442, + "loss": 0.83641136, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3844, + "time_per_iteration": 3.0668578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065987, + "balance_loss_mlp": 1.05714738, + "diversity_loss_mlp": 0.0, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.06587022409921209, + "language_loss": 0.84292293, + "learning_rate": 0.00016739955485945256, + "loss": 0.8535828, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3845, + "time_per_iteration": 2.76232647895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.05776656, + "diversity_loss_mlp": 0.0, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07863227392455628, + "language_loss": 0.85949242, + "learning_rate": 0.00016716700187574513, + "loss": 0.87015998, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3846, + "time_per_iteration": 2.6615161895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068464, + "balance_loss_mlp": 1.05967295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.0694717633397352, + "language_loss": 0.8384943, + "learning_rate": 0.0001669345781092675, + "loss": 0.84917903, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3847, + "time_per_iteration": 2.708287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068988, + "balance_loss_mlp": 1.06022048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.08739626570818541, + "language_loss": 0.87128854, + "learning_rate": 0.0001667022836502546, + "loss": 0.88197839, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 3848, + "time_per_iteration": 2.768453598022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_mlp": 1.06293964, + "diversity_loss_mlp": 0.0, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.07849103844245357, + "language_loss": 0.83004302, + "learning_rate": 0.00016647011858889077, + "loss": 0.84076011, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3849, + "time_per_iteration": 2.553321123123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_mlp": 1.05774295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.0747699795491948, + "language_loss": 0.85671914, + "learning_rate": 0.00016623808301531056, + "loss": 0.86738473, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3850, + "time_per_iteration": 2.6675972938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.06376278, + "diversity_loss_mlp": 0.0, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.08247164679043814, + "language_loss": 0.79259217, + "learning_rate": 0.00016600617701959842, + "loss": 0.8033188, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3851, + "time_per_iteration": 2.7360141277313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_mlp": 1.03028595, + "diversity_loss_mlp": 0.0, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.02428572869696352, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79879034, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.04931641, + "routerloss_mlp": 0.0, + "step": 3852, + "time_per_iteration": 4.992321968078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_mlp": 1.05746007, + "diversity_loss_mlp": 0.0, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.06380286775900439, + "language_loss": 0.81274605, + "learning_rate": 0.00016554275412186315, + "loss": 0.8234092, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 3853, + "time_per_iteration": 2.82212495803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065457, + "balance_loss_mlp": 1.05660534, + "diversity_loss_mlp": 0.0, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.08235676445627264, + "language_loss": 0.80846745, + "learning_rate": 0.0001653112373997568, + "loss": 0.81912202, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3854, + "time_per_iteration": 2.6886162757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.06417763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.0787808176004402, + "language_loss": 0.7459085, + "learning_rate": 0.0001650798506153517, + "loss": 0.75663662, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 3855, + "time_per_iteration": 2.699655294418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064176, + "balance_loss_mlp": 1.05534911, + "diversity_loss_mlp": 0.0, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.13185112675918914, + "language_loss": 0.84102911, + "learning_rate": 0.00016484859385848023, + "loss": 0.85167086, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3856, + "time_per_iteration": 2.6237292289733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.05749846, + "diversity_loss_mlp": 0.0, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.0735312090287519, + "language_loss": 0.77380371, + "learning_rate": 0.0001646174672189243, + "loss": 0.7844646, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 3857, + "time_per_iteration": 2.662250518798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066781, + "balance_loss_mlp": 1.05808437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.07158580991852644, + "language_loss": 0.80202585, + "learning_rate": 0.00016438647078641488, + "loss": 0.81269372, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 3858, + "time_per_iteration": 2.5815234184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_mlp": 1.05223656, + "diversity_loss_mlp": 0.0, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.07922307514532904, + "language_loss": 0.82879561, + "learning_rate": 0.00016415560465063344, + "loss": 0.83940804, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3859, + "time_per_iteration": 2.708585739135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_mlp": 1.04814172, + "diversity_loss_mlp": 0.0, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07844823875052143, + "language_loss": 0.79364371, + "learning_rate": 0.0001639248689012095, + "loss": 0.80421484, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3860, + "time_per_iteration": 2.58583927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05484664, + "diversity_loss_mlp": 0.0, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.0625994675611715, + "language_loss": 0.87600327, + "learning_rate": 0.00016369426362772271, + "loss": 0.88664174, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3861, + "time_per_iteration": 2.7810909748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058337, + "balance_loss_mlp": 1.04926515, + "diversity_loss_mlp": 0.0, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.06941058470153043, + "language_loss": 0.80742699, + "learning_rate": 0.00016346378891970233, + "loss": 0.81801033, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3862, + "time_per_iteration": 2.846928596496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063331, + "balance_loss_mlp": 1.05435514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.0684493510726064, + "language_loss": 0.81710279, + "learning_rate": 0.00016323344486662633, + "loss": 0.82773608, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3863, + "time_per_iteration": 3.331202745437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259883, + "diversity_loss_mlp": 0.0, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05806816249285044, + "language_loss": 0.78816247, + "learning_rate": 0.00016300323155792247, + "loss": 0.79877937, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3864, + "time_per_iteration": 2.872833490371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060231, + "balance_loss_mlp": 1.05139732, + "diversity_loss_mlp": 0.0, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.06583078508607046, + "language_loss": 0.88677347, + "learning_rate": 0.00016277314908296687, + "loss": 0.89737576, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3865, + "time_per_iteration": 2.6268508434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062318, + "balance_loss_mlp": 1.05286467, + "diversity_loss_mlp": 0.0, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.08180248385301583, + "language_loss": 0.7621361, + "learning_rate": 0.00016254319753108604, + "loss": 0.77275932, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3866, + "time_per_iteration": 2.8856914043426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062277, + "balance_loss_mlp": 1.05305004, + "diversity_loss_mlp": 0.0, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.07310249763973194, + "language_loss": 0.77018058, + "learning_rate": 0.00016231337699155492, + "loss": 0.78080332, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3867, + "time_per_iteration": 2.975250244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059189, + "balance_loss_mlp": 1.04974771, + "diversity_loss_mlp": 0.0, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.07083990267041149, + "language_loss": 0.78228271, + "learning_rate": 0.0001620836875535977, + "loss": 0.79287452, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3868, + "time_per_iteration": 2.856765031814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_mlp": 1.04925001, + "diversity_loss_mlp": 0.0, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.058820941096758894, + "language_loss": 0.80752689, + "learning_rate": 0.00016185412930638766, + "loss": 0.81811309, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3869, + "time_per_iteration": 2.7962300777435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060986, + "balance_loss_mlp": 1.05180645, + "diversity_loss_mlp": 0.0, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.09216022180459393, + "language_loss": 0.82565176, + "learning_rate": 0.00016162470233904765, + "loss": 0.83626163, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3870, + "time_per_iteration": 2.727376937866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059095, + "balance_loss_mlp": 1.05008888, + "diversity_loss_mlp": 0.0, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.08871714462123159, + "language_loss": 0.82108277, + "learning_rate": 0.00016139540674064856, + "loss": 0.83167374, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3871, + "time_per_iteration": 2.747559070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055678, + "balance_loss_mlp": 1.04671371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.063692065795828, + "language_loss": 0.7763024, + "learning_rate": 0.00016116624260021113, + "loss": 0.78685915, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3872, + "time_per_iteration": 2.75909423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106192, + "balance_loss_mlp": 1.0528599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.06099997691226976, + "language_loss": 0.83786505, + "learning_rate": 0.0001609372100067046, + "loss": 0.84848428, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3873, + "time_per_iteration": 2.5251874923706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796431, + "balance_loss_mlp": 1.34714556, + "diversity_loss_mlp": 0.22299039, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.03925838692514683, + "language_loss": 0.85007972, + "learning_rate": 0.0001607083090490475, + "loss": 0.85804403, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01136341, + "step": 3874, + "time_per_iteration": 2.8896329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061928, + "balance_loss_mlp": 1.0527246, + "diversity_loss_mlp": 0.0, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.07963892031444339, + "language_loss": 0.80322075, + "learning_rate": 0.00016047953981610714, + "loss": 0.81384003, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3875, + "time_per_iteration": 2.7198143005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_mlp": 1.02416849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.01953041960218584, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80758721, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3876, + "time_per_iteration": 5.047106981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105976, + "balance_loss_mlp": 1.05069435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.07139005535531126, + "language_loss": 0.80606306, + "learning_rate": 0.0001600223968795889, + "loss": 0.81666064, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3877, + "time_per_iteration": 2.8899221420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_mlp": 1.02230287, + "diversity_loss_mlp": 0.0, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.018847716252117216, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76723289, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3878, + "time_per_iteration": 4.949044466018677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.05449665, + "diversity_loss_mlp": 0.0, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.08037956070996295, + "language_loss": 0.8220886, + "learning_rate": 0.00015956578190706483, + "loss": 0.83272392, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3879, + "time_per_iteration": 2.679077386856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058904, + "balance_loss_mlp": 1.04966509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.07423526276361143, + "language_loss": 0.75933188, + "learning_rate": 0.00015933767262892468, + "loss": 0.76992095, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3880, + "time_per_iteration": 2.725120782852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061626, + "balance_loss_mlp": 1.05248249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.08122487442608403, + "language_loss": 0.81791377, + "learning_rate": 0.00015910969560762927, + "loss": 0.82853001, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3881, + "time_per_iteration": 2.5659735202789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_mlp": 1.05212796, + "diversity_loss_mlp": 0.0, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.06269003532148706, + "language_loss": 0.83085567, + "learning_rate": 0.00015888185093168727, + "loss": 0.84146595, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3882, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0554266, + "diversity_loss_mlp": 0.0, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.06569405974283654, + "language_loss": 0.81109202, + "learning_rate": 0.00015865413868955581, + "loss": 0.82174122, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3883, + "time_per_iteration": 2.6078059673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058237, + "balance_loss_mlp": 1.04946291, + "diversity_loss_mlp": 0.0, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.057634664266444945, + "language_loss": 0.82803142, + "learning_rate": 0.00015842655896964054, + "loss": 0.83861375, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3884, + "time_per_iteration": 3.042433977127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_mlp": 1.0528096, + "diversity_loss_mlp": 0.0, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.07244796431130596, + "language_loss": 0.73654252, + "learning_rate": 0.00015819911186029567, + "loss": 0.74716115, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3885, + "time_per_iteration": 2.8399569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063739, + "balance_loss_mlp": 1.05458951, + "diversity_loss_mlp": 0.0, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.0730187367037383, + "language_loss": 0.86386681, + "learning_rate": 0.00015797179744982443, + "loss": 0.87450415, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3886, + "time_per_iteration": 2.6979753971099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068538, + "balance_loss_mlp": 1.05947804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.06196383449999257, + "language_loss": 0.78900141, + "learning_rate": 0.00015774461582647765, + "loss": 0.79968679, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3887, + "time_per_iteration": 2.6235530376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_mlp": 1.05791271, + "diversity_loss_mlp": 0.0, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07428746170121639, + "language_loss": 0.81271255, + "learning_rate": 0.00015751756707845505, + "loss": 0.82338268, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3888, + "time_per_iteration": 2.654217481613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066279, + "balance_loss_mlp": 1.05733204, + "diversity_loss_mlp": 0.0, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.06349901375293318, + "language_loss": 0.8820529, + "learning_rate": 0.00015729065129390502, + "loss": 0.89271569, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3889, + "time_per_iteration": 2.990723133087158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107017, + "balance_loss_mlp": 1.06125295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.10644115001559669, + "language_loss": 0.82281494, + "learning_rate": 0.0001570638685609241, + "loss": 0.83351666, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3890, + "time_per_iteration": 2.562049627304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064882, + "balance_loss_mlp": 1.0558815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.07005408827456952, + "language_loss": 0.80632579, + "learning_rate": 0.00015683721896755693, + "loss": 0.81697452, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3891, + "time_per_iteration": 2.5688047409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018069, + "balance_loss_mlp": 1.01291943, + "diversity_loss_mlp": 0.0, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.021126139986013294, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228564, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3892, + "time_per_iteration": 4.9241249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063391, + "balance_loss_mlp": 1.05425954, + "diversity_loss_mlp": 0.0, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.07047459901443781, + "language_loss": 0.85042292, + "learning_rate": 0.00015638431955158528, + "loss": 0.8610568, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3893, + "time_per_iteration": 2.696835517883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059789, + "balance_loss_mlp": 1.05092609, + "diversity_loss_mlp": 0.0, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.07429691825865621, + "language_loss": 0.81044436, + "learning_rate": 0.00015615806990481186, + "loss": 0.8210423, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 3894, + "time_per_iteration": 2.721975088119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.05332768573038703, + "language_loss": 0.84447378, + "learning_rate": 0.00015593195374931452, + "loss": 0.85509074, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3895, + "time_per_iteration": 2.724210500717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057311, + "balance_loss_mlp": 1.04820967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.08170178598725314, + "language_loss": 0.79939067, + "learning_rate": 0.00015570597117287922, + "loss": 0.80996376, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3896, + "time_per_iteration": 2.6550590991973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058835, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.07111999470543245, + "language_loss": 0.77950025, + "learning_rate": 0.0001554801222632406, + "loss": 0.79008865, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3897, + "time_per_iteration": 2.5913069248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058781, + "balance_loss_mlp": 1.04961967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.07004004520272819, + "language_loss": 0.8521589, + "learning_rate": 0.00015525440710808052, + "loss": 0.86274672, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3898, + "time_per_iteration": 2.633772850036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105768, + "balance_loss_mlp": 1.04835165, + "diversity_loss_mlp": 0.0, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.07310706246925956, + "language_loss": 0.77907795, + "learning_rate": 0.00015502882579502953, + "loss": 0.78965473, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3899, + "time_per_iteration": 2.938547372817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_mlp": 1.04551327, + "diversity_loss_mlp": 0.0, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.06650950979385485, + "language_loss": 0.8470974, + "learning_rate": 0.00015480337841166592, + "loss": 0.85764492, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3900, + "time_per_iteration": 2.719611167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_mlp": 1.05532193, + "diversity_loss_mlp": 0.0, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06798274648693917, + "language_loss": 0.83017278, + "learning_rate": 0.00015457806504551647, + "loss": 0.84081692, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3901, + "time_per_iteration": 2.815099000930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055292, + "balance_loss_mlp": 1.04617858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.06551967362841071, + "language_loss": 0.78146368, + "learning_rate": 0.0001543528857840554, + "loss": 0.79201663, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3902, + "time_per_iteration": 2.660747528076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105756, + "balance_loss_mlp": 1.04829192, + "diversity_loss_mlp": 0.0, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.08761977110880032, + "language_loss": 0.80069476, + "learning_rate": 0.000154127840714705, + "loss": 0.81127042, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3903, + "time_per_iteration": 2.791895627975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_mlp": 1.04786348, + "diversity_loss_mlp": 0.0, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.08489214172044417, + "language_loss": 0.82145894, + "learning_rate": 0.00015390292992483557, + "loss": 0.83203179, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3904, + "time_per_iteration": 2.531291961669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058521, + "balance_loss_mlp": 1.04955626, + "diversity_loss_mlp": 0.0, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.06641081846092535, + "language_loss": 0.84235787, + "learning_rate": 0.00015367815350176523, + "loss": 0.85294312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3905, + "time_per_iteration": 2.7290806770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_mlp": 1.04627776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.06804815402684934, + "language_loss": 0.82392836, + "learning_rate": 0.00015345351153275987, + "loss": 0.8344835, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3906, + "time_per_iteration": 2.530323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054875, + "balance_loss_mlp": 1.04556477, + "diversity_loss_mlp": 0.0, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.06371304983723255, + "language_loss": 0.80832905, + "learning_rate": 0.00015322900410503332, + "loss": 0.81887782, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3907, + "time_per_iteration": 2.840207576751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062824, + "balance_loss_mlp": 1.05359089, + "diversity_loss_mlp": 0.0, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.0661364017188776, + "language_loss": 0.77996182, + "learning_rate": 0.00015300463130574703, + "loss": 0.79059005, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3908, + "time_per_iteration": 2.8597986698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795371, + "balance_loss_mlp": 1.3454839, + "diversity_loss_mlp": 0.22311893, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.027335085290279493, + "language_loss": 0.81861627, + "learning_rate": 0.00015278039322201033, + "loss": 0.82656997, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01107004, + "step": 3909, + "time_per_iteration": 2.991687774658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056027, + "balance_loss_mlp": 1.04691339, + "diversity_loss_mlp": 0.0, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.07802530294793614, + "language_loss": 0.79405951, + "learning_rate": 0.00015255628994088004, + "loss": 0.80461979, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3910, + "time_per_iteration": 2.552389621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057943, + "balance_loss_mlp": 1.04875183, + "diversity_loss_mlp": 0.0, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06839079088853381, + "language_loss": 0.75070244, + "learning_rate": 0.00015233232154936082, + "loss": 0.76128185, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3911, + "time_per_iteration": 3.2685062885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060306, + "balance_loss_mlp": 1.05104983, + "diversity_loss_mlp": 0.0, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0742904302268966, + "language_loss": 0.76248109, + "learning_rate": 0.0001521084881344048, + "loss": 0.77308416, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3912, + "time_per_iteration": 2.8669307231903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063744, + "balance_loss_mlp": 1.05449915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.07365945451583152, + "language_loss": 0.86536098, + "learning_rate": 0.00015188478978291208, + "loss": 0.87599838, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3913, + "time_per_iteration": 2.8062844276428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060792, + "balance_loss_mlp": 1.05141592, + "diversity_loss_mlp": 0.0, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06964875853647617, + "language_loss": 0.86198735, + "learning_rate": 0.00015166122658173014, + "loss": 0.87259525, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3914, + "time_per_iteration": 2.832261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062251, + "balance_loss_mlp": 1.05276752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.07069372780846282, + "language_loss": 0.88695043, + "learning_rate": 0.00015143779861765332, + "loss": 0.89757293, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3915, + "time_per_iteration": 2.876596689224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057505, + "balance_loss_mlp": 1.04845726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.07477721009048348, + "language_loss": 0.81360573, + "learning_rate": 0.00015121450597742458, + "loss": 0.82418078, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3916, + "time_per_iteration": 2.83457612991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105545, + "balance_loss_mlp": 1.04619908, + "diversity_loss_mlp": 0.0, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07347506206734646, + "language_loss": 0.78634655, + "learning_rate": 0.00015099134874773369, + "loss": 0.79690111, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3917, + "time_per_iteration": 2.7597367763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793692, + "balance_loss_mlp": 1.34194863, + "diversity_loss_mlp": 0.22241086, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.028776380158614775, + "language_loss": 0.80358481, + "learning_rate": 0.00015076832701521793, + "loss": 0.81152171, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151239, + "step": 3918, + "time_per_iteration": 2.746518135070801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_mlp": 1.04122829, + "diversity_loss_mlp": 0.0, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.08224807804324459, + "language_loss": 0.82372093, + "learning_rate": 0.000150545440866462, + "loss": 0.83422583, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3919, + "time_per_iteration": 2.986933708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056343, + "balance_loss_mlp": 1.047104, + "diversity_loss_mlp": 0.0, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.07659379290436485, + "language_loss": 0.78524017, + "learning_rate": 0.000150322690387998, + "loss": 0.79580355, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3920, + "time_per_iteration": 2.5535264015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053379, + "balance_loss_mlp": 1.04395509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.08088787979004233, + "language_loss": 0.75178206, + "learning_rate": 0.00015010007566630535, + "loss": 0.76231587, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3921, + "time_per_iteration": 2.752476930618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052876, + "balance_loss_mlp": 1.0435003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09066204118342673, + "language_loss": 0.81410325, + "learning_rate": 0.00014987759678781077, + "loss": 0.82463199, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3922, + "time_per_iteration": 2.6611218452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_mlp": 1.04057336, + "diversity_loss_mlp": 0.0, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07014269793522399, + "language_loss": 0.82503462, + "learning_rate": 0.00014965525383888795, + "loss": 0.83553147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3923, + "time_per_iteration": 2.7689740657806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_mlp": 1.04243279, + "diversity_loss_mlp": 0.0, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.07037901848858046, + "language_loss": 0.72344971, + "learning_rate": 0.00014943304690585851, + "loss": 0.73396569, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3924, + "time_per_iteration": 2.926786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050623, + "balance_loss_mlp": 1.04116416, + "diversity_loss_mlp": 0.0, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07074790487011906, + "language_loss": 0.79134214, + "learning_rate": 0.0001492109760749908, + "loss": 0.80184835, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3925, + "time_per_iteration": 2.6663551330566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.03920674, + "diversity_loss_mlp": 0.0, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.06259359506310941, + "language_loss": 0.79865938, + "learning_rate": 0.00014898904143250002, + "loss": 0.80914569, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3926, + "time_per_iteration": 2.7111570835113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007032, + "balance_loss_mlp": 1.00193024, + "diversity_loss_mlp": 0.0, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.018464770707338953, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76762235, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.05102539, + "routerloss_mlp": 0.0, + "step": 3927, + "time_per_iteration": 4.9247355461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049551, + "balance_loss_mlp": 1.04027081, + "diversity_loss_mlp": 0.0, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.0681788266526358, + "language_loss": 0.80484271, + "learning_rate": 0.0001485455810572474, + "loss": 0.81533813, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3928, + "time_per_iteration": 2.644436836242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050291, + "balance_loss_mlp": 1.04075408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.05891834719109388, + "language_loss": 0.83858299, + "learning_rate": 0.00014832405549665236, + "loss": 0.84908581, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3929, + "time_per_iteration": 2.7012484073638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.03651154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06702269562440989, + "language_loss": 0.78850049, + "learning_rate": 0.00014810266646876746, + "loss": 0.79895926, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3930, + "time_per_iteration": 2.768267869949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_mlp": 1.0400542, + "diversity_loss_mlp": 0.0, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.07203252309013448, + "language_loss": 0.77448905, + "learning_rate": 0.00014788141405954364, + "loss": 0.78498399, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3931, + "time_per_iteration": 2.9904940128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_mlp": 1.03817058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.07800689348595595, + "language_loss": 0.8509475, + "learning_rate": 0.00014766029835487865, + "loss": 0.86142522, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3932, + "time_per_iteration": 2.712207078933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.04148519, + "diversity_loss_mlp": 0.0, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.09178447768332373, + "language_loss": 0.79506183, + "learning_rate": 0.0001474393194406173, + "loss": 0.80557162, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3933, + "time_per_iteration": 2.933224678039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048897, + "balance_loss_mlp": 1.03937268, + "diversity_loss_mlp": 0.0, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05892607400759823, + "language_loss": 0.79702771, + "learning_rate": 0.00014721847740255112, + "loss": 0.80751669, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3934, + "time_per_iteration": 2.826552391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003728, + "balance_loss_mlp": 0.99864936, + "diversity_loss_mlp": 0.0, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.02131829704568505, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74915653, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 3935, + "time_per_iteration": 4.626272439956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050547, + "balance_loss_mlp": 1.0411061, + "diversity_loss_mlp": 0.0, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.08283198519893772, + "language_loss": 0.78541541, + "learning_rate": 0.00014677720429790526, + "loss": 0.79592091, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3936, + "time_per_iteration": 2.634308338165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046754, + "balance_loss_mlp": 1.03724098, + "diversity_loss_mlp": 0.0, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.060589870954327815, + "language_loss": 0.84442061, + "learning_rate": 0.0001465567734026429, + "loss": 0.8548882, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3937, + "time_per_iteration": 2.716531276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04150677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08803792614427135, + "language_loss": 0.82826757, + "learning_rate": 0.00014633647972621034, + "loss": 0.83878005, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3938, + "time_per_iteration": 2.4589834213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053327, + "balance_loss_mlp": 1.04381418, + "diversity_loss_mlp": 0.0, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.07008474871833649, + "language_loss": 0.86420083, + "learning_rate": 0.00014611632335413354, + "loss": 0.87473404, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3939, + "time_per_iteration": 2.7953155040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_mlp": 1.04597211, + "diversity_loss_mlp": 0.0, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.06005420836927303, + "language_loss": 0.82715803, + "learning_rate": 0.00014589630437188456, + "loss": 0.83771348, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3940, + "time_per_iteration": 3.1720919609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056474, + "balance_loss_mlp": 1.04727697, + "diversity_loss_mlp": 0.0, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.07556117037580423, + "language_loss": 0.78885162, + "learning_rate": 0.00014567642286488253, + "loss": 0.7994163, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3941, + "time_per_iteration": 2.5224215984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105497, + "balance_loss_mlp": 1.0453198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.10380533878684198, + "language_loss": 0.79189527, + "learning_rate": 0.00014545667891849258, + "loss": 0.80244499, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3942, + "time_per_iteration": 2.6196579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_mlp": 1.04717493, + "diversity_loss_mlp": 0.0, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.06980232416240703, + "language_loss": 0.82745945, + "learning_rate": 0.00014523707261802733, + "loss": 0.83802581, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3943, + "time_per_iteration": 2.652625799179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794094, + "balance_loss_mlp": 1.34365344, + "diversity_loss_mlp": 0.22232203, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.034795977662747106, + "language_loss": 0.81799769, + "learning_rate": 0.00014501760404874527, + "loss": 0.82593858, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01110633, + "step": 3944, + "time_per_iteration": 2.7529001235961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059832, + "balance_loss_mlp": 1.05071235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.07566953086997541, + "language_loss": 0.85807776, + "learning_rate": 0.00014479827329585176, + "loss": 0.86867607, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3945, + "time_per_iteration": 2.701622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_mlp": 1.04233766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.05933089648069645, + "language_loss": 0.84881538, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933375, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3946, + "time_per_iteration": 2.728095769882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787034, + "balance_loss_mlp": 1.32538223, + "diversity_loss_mlp": 0.22601989, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.02987157443530754, + "language_loss": 0.83105904, + "learning_rate": 0.00014436002557978371, + "loss": 0.83892936, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.011333, + "step": 3947, + "time_per_iteration": 2.8229527473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009615, + "balance_loss_mlp": 1.00491834, + "diversity_loss_mlp": 0.0, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.009520189474687826, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77652764, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.046875, + "routerloss_mlp": 0.0, + "step": 3948, + "time_per_iteration": 6.289541482925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060096, + "balance_loss_mlp": 1.05072582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.06379991139513626, + "language_loss": 0.79987645, + "learning_rate": 0.0001439223301503945, + "loss": 0.8104775, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3949, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063286, + "balance_loss_mlp": 1.05441725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.07443357695534152, + "language_loss": 0.75937033, + "learning_rate": 0.00014370368975564834, + "loss": 0.7700032, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 3950, + "time_per_iteration": 2.939652442932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062595, + "balance_loss_mlp": 1.05339789, + "diversity_loss_mlp": 0.0, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.07225326310483449, + "language_loss": 0.83501256, + "learning_rate": 0.00014348518768739766, + "loss": 0.84563851, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3951, + "time_per_iteration": 2.760315179824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013895, + "balance_loss_mlp": 1.00924563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.01015881799745275, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77741933, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 3952, + "time_per_iteration": 4.8084025382995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106276, + "balance_loss_mlp": 1.05365205, + "diversity_loss_mlp": 0.0, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.06460876756714844, + "language_loss": 0.86549526, + "learning_rate": 0.00014304859886964867, + "loss": 0.87612283, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3953, + "time_per_iteration": 2.9919626712799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05655789, + "diversity_loss_mlp": 0.0, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.06531272999026969, + "language_loss": 0.83625901, + "learning_rate": 0.00014283051228964878, + "loss": 0.84691703, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3954, + "time_per_iteration": 2.7195558547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060232, + "balance_loss_mlp": 1.05114245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.06973579873696066, + "language_loss": 0.82862848, + "learning_rate": 0.00014261256437514197, + "loss": 0.83923078, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3955, + "time_per_iteration": 2.6542091369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794195, + "balance_loss_mlp": 1.3411088, + "diversity_loss_mlp": 0.22477356, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.03401627820018092, + "language_loss": 0.82645166, + "learning_rate": 0.0001423947552107428, + "loss": 0.83439362, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0112533, + "step": 3956, + "time_per_iteration": 2.7648067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.05335546, + "diversity_loss_mlp": 0.0, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06632119476384091, + "language_loss": 0.77184016, + "learning_rate": 0.00014217708488101243, + "loss": 0.78246629, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3957, + "time_per_iteration": 3.1002120971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05514848, + "diversity_loss_mlp": 0.0, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08639703813163502, + "language_loss": 0.77281177, + "learning_rate": 0.0001419595534704579, + "loss": 0.78345418, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3958, + "time_per_iteration": 2.7124218940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062721, + "balance_loss_mlp": 1.05369043, + "diversity_loss_mlp": 0.0, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.06838082339011158, + "language_loss": 0.81229275, + "learning_rate": 0.00014174216106353237, + "loss": 0.82291996, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3959, + "time_per_iteration": 2.628516912460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060156, + "balance_loss_mlp": 1.05085802, + "diversity_loss_mlp": 0.0, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.07205328766008003, + "language_loss": 0.76858711, + "learning_rate": 0.00014152490774463512, + "loss": 0.77918863, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3960, + "time_per_iteration": 2.630159854888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106295, + "balance_loss_mlp": 1.05382431, + "diversity_loss_mlp": 0.0, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.0819861529910791, + "language_loss": 0.87198371, + "learning_rate": 0.00014130779359811135, + "loss": 0.88261318, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3961, + "time_per_iteration": 2.464413642883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058979, + "balance_loss_mlp": 1.04990077, + "diversity_loss_mlp": 0.0, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.07245892571162069, + "language_loss": 0.85946453, + "learning_rate": 0.0001410908187082521, + "loss": 0.87005424, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3962, + "time_per_iteration": 2.921780586242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058262, + "balance_loss_mlp": 1.04887986, + "diversity_loss_mlp": 0.0, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06688462156779182, + "language_loss": 0.83390021, + "learning_rate": 0.0001408739831592949, + "loss": 0.84448284, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3963, + "time_per_iteration": 2.6833889484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060293, + "balance_loss_mlp": 1.05104804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.0755930480675871, + "language_loss": 0.77544367, + "learning_rate": 0.0001406572870354224, + "loss": 0.7860465, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3964, + "time_per_iteration": 2.7871947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060093, + "balance_loss_mlp": 1.05084801, + "diversity_loss_mlp": 0.0, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.06988595261199848, + "language_loss": 0.86813599, + "learning_rate": 0.00014044073042076337, + "loss": 0.87873685, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3965, + "time_per_iteration": 2.4948155879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_mlp": 1.0558666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.053016831320737375, + "language_loss": 0.88845956, + "learning_rate": 0.00014022431339939302, + "loss": 0.8991074, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3966, + "time_per_iteration": 2.673383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057572, + "balance_loss_mlp": 1.04824972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.09057872820095057, + "language_loss": 0.7816959, + "learning_rate": 0.00014000803605533163, + "loss": 0.79227161, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3967, + "time_per_iteration": 2.8631951808929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057314, + "balance_loss_mlp": 1.04857016, + "diversity_loss_mlp": 0.0, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.08630668575925342, + "language_loss": 0.84042531, + "learning_rate": 0.00013979189847254553, + "loss": 0.85099846, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3968, + "time_per_iteration": 2.5586295127868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057911, + "balance_loss_mlp": 1.04832053, + "diversity_loss_mlp": 0.0, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.07119073500769035, + "language_loss": 0.80335605, + "learning_rate": 0.00013957590073494674, + "loss": 0.81393516, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3969, + "time_per_iteration": 2.785759449005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055351, + "balance_loss_mlp": 1.0460887, + "diversity_loss_mlp": 0.0, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0691753234001315, + "language_loss": 0.78865349, + "learning_rate": 0.0001393600429263931, + "loss": 0.79920697, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3970, + "time_per_iteration": 2.7582993507385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013524, + "balance_loss_mlp": 1.00873148, + "diversity_loss_mlp": 0.0, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.011908325756944461, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.7575841, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 3971, + "time_per_iteration": 4.944155693054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051128, + "balance_loss_mlp": 1.04182386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.07417078530438988, + "language_loss": 0.81570405, + "learning_rate": 0.0001389287474315804, + "loss": 0.82621539, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3972, + "time_per_iteration": 2.6553244590759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052519, + "balance_loss_mlp": 1.04347086, + "diversity_loss_mlp": 0.0, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.05487535888911553, + "language_loss": 0.79840803, + "learning_rate": 0.00013871330991276505, + "loss": 0.8089332, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 3973, + "time_per_iteration": 2.681697368621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_mlp": 1.0428077, + "diversity_loss_mlp": 0.0, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.08960984364762024, + "language_loss": 0.80946076, + "learning_rate": 0.00013849801265788247, + "loss": 0.81998283, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3974, + "time_per_iteration": 3.0523104667663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796632, + "balance_loss_mlp": 1.34598541, + "diversity_loss_mlp": 0.22497699, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.033347453631336434, + "language_loss": 0.83125114, + "learning_rate": 0.00013828285575051818, + "loss": 0.83921754, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115073, + "step": 3975, + "time_per_iteration": 2.631014108657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_mlp": 1.04301977, + "diversity_loss_mlp": 0.0, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.06872239671854397, + "language_loss": 0.84060633, + "learning_rate": 0.0001380678392742035, + "loss": 0.85112655, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3976, + "time_per_iteration": 2.710768938064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050013, + "balance_loss_mlp": 1.04042244, + "diversity_loss_mlp": 0.0, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05722299510673748, + "language_loss": 0.84721446, + "learning_rate": 0.00013785296331241526, + "loss": 0.85771459, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3977, + "time_per_iteration": 2.863175868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049924, + "balance_loss_mlp": 1.04060829, + "diversity_loss_mlp": 0.0, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.0690026214963165, + "language_loss": 0.87410915, + "learning_rate": 0.00013763822794857583, + "loss": 0.88460839, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3978, + "time_per_iteration": 3.3100810050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049847, + "balance_loss_mlp": 1.04050136, + "diversity_loss_mlp": 0.0, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06632607852839086, + "language_loss": 0.90003061, + "learning_rate": 0.00013742363326605278, + "loss": 0.91052908, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3979, + "time_per_iteration": 2.754115581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_mlp": 1.04258752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.059791344398012564, + "language_loss": 0.78432417, + "learning_rate": 0.00013720917934815935, + "loss": 0.79484463, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3980, + "time_per_iteration": 2.801797866821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_mlp": 1.04425907, + "diversity_loss_mlp": 0.0, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.08312893208703641, + "language_loss": 0.82967758, + "learning_rate": 0.00013699486627815344, + "loss": 0.84021544, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3981, + "time_per_iteration": 2.6589224338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052365, + "balance_loss_mlp": 1.04295897, + "diversity_loss_mlp": 0.0, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.07260212580199023, + "language_loss": 0.82633436, + "learning_rate": 0.00013678069413923928, + "loss": 0.83685803, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3982, + "time_per_iteration": 2.6876726150512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_mlp": 1.0454247, + "diversity_loss_mlp": 0.0, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.060912508562222696, + "language_loss": 0.81971568, + "learning_rate": 0.00013656666301456555, + "loss": 0.83026105, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3983, + "time_per_iteration": 2.547969341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.04195666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.07203556219041155, + "language_loss": 0.84272242, + "learning_rate": 0.0001363527729872267, + "loss": 0.85323668, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3984, + "time_per_iteration": 2.638418197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052921, + "balance_loss_mlp": 1.04378974, + "diversity_loss_mlp": 0.0, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06683426358110046, + "language_loss": 0.76389247, + "learning_rate": 0.00013613902414026207, + "loss": 0.77442169, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3985, + "time_per_iteration": 2.7989237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055274, + "balance_loss_mlp": 1.04588056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.07515257411295292, + "language_loss": 0.82508516, + "learning_rate": 0.00013592541655665642, + "loss": 0.83563781, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3986, + "time_per_iteration": 3.015293836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_mlp": 1.04635525, + "diversity_loss_mlp": 0.0, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.07774054250244124, + "language_loss": 0.85269868, + "learning_rate": 0.00013571195031933947, + "loss": 0.86325783, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3987, + "time_per_iteration": 2.6980810165405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010581, + "balance_loss_mlp": 1.0057168, + "diversity_loss_mlp": 0.0, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.012742252799641985, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81491923, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 3988, + "time_per_iteration": 4.809666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049905, + "balance_loss_mlp": 1.04043365, + "diversity_loss_mlp": 0.0, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.07424799958173026, + "language_loss": 0.85590923, + "learning_rate": 0.00013528544221501655, + "loss": 0.86640829, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3989, + "time_per_iteration": 2.7649118900299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010579, + "balance_loss_mlp": 1.04848218, + "diversity_loss_mlp": 0.0, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.07001972276723446, + "language_loss": 0.81763613, + "learning_rate": 0.00013507240051359586, + "loss": 0.82821512, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3990, + "time_per_iteration": 3.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_mlp": 1.04797447, + "diversity_loss_mlp": 0.0, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.07160878890290734, + "language_loss": 0.86059034, + "learning_rate": 0.00013485950048963425, + "loss": 0.87116206, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3991, + "time_per_iteration": 2.5790224075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_mlp": 1.04409003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.0667031946156718, + "language_loss": 0.82767689, + "learning_rate": 0.00013464674222578643, + "loss": 0.83820868, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3992, + "time_per_iteration": 3.201578140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057061, + "balance_loss_mlp": 1.04791176, + "diversity_loss_mlp": 0.0, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.08569609854575283, + "language_loss": 0.83404213, + "learning_rate": 0.00013443412580465292, + "loss": 0.84461272, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3993, + "time_per_iteration": 2.5704004764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_mlp": 1.04118383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.0673936052155154, + "language_loss": 0.83964813, + "learning_rate": 0.00013422165130877857, + "loss": 0.85015404, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3994, + "time_per_iteration": 2.9138286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057424, + "balance_loss_mlp": 1.0483048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.07281784593119212, + "language_loss": 0.8049981, + "learning_rate": 0.00013400931882065327, + "loss": 0.81557238, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3995, + "time_per_iteration": 2.6342077255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055278, + "balance_loss_mlp": 1.04585409, + "diversity_loss_mlp": 0.0, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.062093519620885704, + "language_loss": 0.80842459, + "learning_rate": 0.0001337971284227118, + "loss": 0.81897736, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3996, + "time_per_iteration": 3.0022008419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004861, + "balance_loss_mlp": 1.00011611, + "diversity_loss_mlp": 0.0, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.007312606829584695, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77123284, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3997, + "time_per_iteration": 4.911606311798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055259, + "balance_loss_mlp": 1.04605579, + "diversity_loss_mlp": 0.0, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.06973120075241693, + "language_loss": 0.8046248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81517738, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3998, + "time_per_iteration": 2.683593273162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053524, + "balance_loss_mlp": 1.0442791, + "diversity_loss_mlp": 0.0, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.0765354269800423, + "language_loss": 0.85693717, + "learning_rate": 0.0001331614105935109, + "loss": 0.86747241, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3999, + "time_per_iteration": 2.675220489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054062, + "balance_loss_mlp": 1.04481769, + "diversity_loss_mlp": 0.0, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.06349178277774252, + "language_loss": 0.84176111, + "learning_rate": 0.00013294978937954883, + "loss": 0.85230172, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4000, + "time_per_iteration": 2.8622941970825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_mlp": 1.04558492, + "diversity_loss_mlp": 0.0, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.09234703224205486, + "language_loss": 0.85414779, + "learning_rate": 0.00013273831066711655, + "loss": 0.86469758, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4001, + "time_per_iteration": 2.6298534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_mlp": 1.04325461, + "diversity_loss_mlp": 0.0, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.06055695533202859, + "language_loss": 0.79907209, + "learning_rate": 0.00013252697453831747, + "loss": 0.8095969, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4002, + "time_per_iteration": 2.692922830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.03798985, + "diversity_loss_mlp": 0.0, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.06495740089460322, + "language_loss": 0.82613641, + "learning_rate": 0.00013231578107519916, + "loss": 0.83660942, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4003, + "time_per_iteration": 2.9229555130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049706, + "balance_loss_mlp": 1.04049134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.07621650724161941, + "language_loss": 0.82803172, + "learning_rate": 0.00013210473035975422, + "loss": 0.83852881, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4004, + "time_per_iteration": 2.569532632827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.04116035, + "diversity_loss_mlp": 0.0, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.07296352629436301, + "language_loss": 0.85812414, + "learning_rate": 0.0001318938224739201, + "loss": 0.86862826, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4005, + "time_per_iteration": 3.0234341621398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_mlp": 1.04063106, + "diversity_loss_mlp": 0.0, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06528825004105314, + "language_loss": 0.83766401, + "learning_rate": 0.00013168305749957843, + "loss": 0.84816337, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 4006, + "time_per_iteration": 2.733548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790765, + "balance_loss_mlp": 1.33768153, + "diversity_loss_mlp": 0.22157404, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.030772470198916744, + "language_loss": 0.82874978, + "learning_rate": 0.00013147243551855532, + "loss": 0.8366574, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01113757, + "step": 4007, + "time_per_iteration": 2.6124446392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_mlp": 1.0404737, + "diversity_loss_mlp": 0.0, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.05859111752284866, + "language_loss": 0.80677342, + "learning_rate": 0.00013126195661262148, + "loss": 0.81727076, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4008, + "time_per_iteration": 2.7372946739196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052041, + "balance_loss_mlp": 1.04286766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.06950402202343967, + "language_loss": 0.86921602, + "learning_rate": 0.00013105162086349216, + "loss": 0.87973642, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4009, + "time_per_iteration": 2.825164556503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050421, + "balance_loss_mlp": 1.04102159, + "diversity_loss_mlp": 0.0, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05664497988696294, + "language_loss": 0.85945249, + "learning_rate": 0.00013084142835282687, + "loss": 0.86995667, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 4010, + "time_per_iteration": 2.6627306938171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590218, + "balance_loss_mlp": 1.02735484, + "diversity_loss_mlp": 0.13424492, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.0012430140076356488, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80474579, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00941846, + "step": 4011, + "time_per_iteration": 4.808507919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050884, + "balance_loss_mlp": 1.04154992, + "diversity_loss_mlp": 0.0, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.062052307609784016, + "language_loss": 0.89290094, + "learning_rate": 0.0001304214733732485, + "loss": 0.90340984, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4012, + "time_per_iteration": 2.7328708171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105349, + "balance_loss_mlp": 1.04380453, + "diversity_loss_mlp": 0.0, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.07734543299334512, + "language_loss": 0.82669097, + "learning_rate": 0.00013021171106737672, + "loss": 0.83722585, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 4013, + "time_per_iteration": 2.6573734283447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049877, + "balance_loss_mlp": 1.04070377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.06603423132938777, + "language_loss": 0.80092031, + "learning_rate": 0.00013000209232605071, + "loss": 0.81141913, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4014, + "time_per_iteration": 2.717602014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053571, + "balance_loss_mlp": 1.04388535, + "diversity_loss_mlp": 0.0, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.10571386830465022, + "language_loss": 0.80179751, + "learning_rate": 0.0001297926172306519, + "loss": 0.81233323, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 4015, + "time_per_iteration": 2.65010142326355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_mlp": 1.04230273, + "diversity_loss_mlp": 0.0, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.06492582612573077, + "language_loss": 0.7883606, + "learning_rate": 0.0001295832858625055, + "loss": 0.79887861, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4016, + "time_per_iteration": 3.2565736770629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.04109037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.06662088321139942, + "language_loss": 0.70083648, + "learning_rate": 0.00012937409830288154, + "loss": 0.71134186, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 4017, + "time_per_iteration": 2.818197250366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046111, + "balance_loss_mlp": 1.03688383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08953669234150197, + "language_loss": 0.84953344, + "learning_rate": 0.00012916505463299362, + "loss": 0.85999447, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4018, + "time_per_iteration": 2.5104525089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_mlp": 1.03696132, + "diversity_loss_mlp": 0.0, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.08710028809718832, + "language_loss": 0.78235918, + "learning_rate": 0.00012895615493399972, + "loss": 0.79282427, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 4019, + "time_per_iteration": 2.7878103256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104679, + "balance_loss_mlp": 1.03747368, + "diversity_loss_mlp": 0.0, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.07808729146965544, + "language_loss": 0.82637143, + "learning_rate": 0.00012874739928700192, + "loss": 0.83683932, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 4020, + "time_per_iteration": 2.5788097381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_mlp": 1.03501582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.07324265685000747, + "language_loss": 0.79874408, + "learning_rate": 0.00012853878777304624, + "loss": 0.80918914, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 4021, + "time_per_iteration": 2.870278835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794381, + "balance_loss_mlp": 1.34430456, + "diversity_loss_mlp": 0.22252312, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.029931863934209574, + "language_loss": 0.84459031, + "learning_rate": 0.000128330320473123, + "loss": 0.85253412, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01096685, + "step": 4022, + "time_per_iteration": 2.7129287719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008173, + "balance_loss_mlp": 1.00330901, + "diversity_loss_mlp": 0.0, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.013994594591819043, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.7934007, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 4023, + "time_per_iteration": 4.895900726318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051725, + "balance_loss_mlp": 1.04231346, + "diversity_loss_mlp": 0.0, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.07018696985022486, + "language_loss": 0.81708258, + "learning_rate": 0.0001279138188390543, + "loss": 0.82759976, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 4024, + "time_per_iteration": 2.745079517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_mlp": 1.04130435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.06486800405407347, + "language_loss": 0.86009115, + "learning_rate": 0.00012770578466660915, + "loss": 0.87059748, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4025, + "time_per_iteration": 2.848886013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.04474843, + "diversity_loss_mlp": 0.0, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.06391594939980325, + "language_loss": 0.81626999, + "learning_rate": 0.0001274978950315968, + "loss": 0.82681203, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 4026, + "time_per_iteration": 2.791773796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104997, + "balance_loss_mlp": 1.04037929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.11270799389052534, + "language_loss": 0.83240479, + "learning_rate": 0.00012729015001472716, + "loss": 0.84290445, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 4027, + "time_per_iteration": 2.6333580017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_mlp": 1.04164386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.06039716871949276, + "language_loss": 0.81597829, + "learning_rate": 0.00012708254969665418, + "loss": 0.82648969, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4028, + "time_per_iteration": 2.753960132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.0482347, + "diversity_loss_mlp": 0.0, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.08015627547619836, + "language_loss": 0.83207834, + "learning_rate": 0.00012687509415797526, + "loss": 0.84265172, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4029, + "time_per_iteration": 2.549224376678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055891, + "balance_loss_mlp": 1.04669952, + "diversity_loss_mlp": 0.0, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.0754412874698092, + "language_loss": 0.81577122, + "learning_rate": 0.00012666778347923208, + "loss": 0.82633013, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4030, + "time_per_iteration": 2.6578049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058934, + "balance_loss_mlp": 1.04996991, + "diversity_loss_mlp": 0.0, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05434911795401194, + "language_loss": 0.83884913, + "learning_rate": 0.0001264606177409092, + "loss": 0.84943849, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4031, + "time_per_iteration": 2.7437548637390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054431, + "balance_loss_mlp": 1.04539514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.06981681066227559, + "language_loss": 0.85926938, + "learning_rate": 0.00012625359702343609, + "loss": 0.86981368, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4032, + "time_per_iteration": 2.7145252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062978, + "balance_loss_mlp": 1.05414999, + "diversity_loss_mlp": 0.0, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.06703655691775996, + "language_loss": 0.84627414, + "learning_rate": 0.00012604672140718504, + "loss": 0.85690391, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4033, + "time_per_iteration": 2.6776609420776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061314, + "balance_loss_mlp": 1.05224824, + "diversity_loss_mlp": 0.0, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.0713724123127894, + "language_loss": 0.77912575, + "learning_rate": 0.00012583999097247233, + "loss": 0.78973895, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4034, + "time_per_iteration": 2.8429367542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058219, + "balance_loss_mlp": 1.04938531, + "diversity_loss_mlp": 0.0, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.07138701732892383, + "language_loss": 0.80042505, + "learning_rate": 0.0001256334057995578, + "loss": 0.81100732, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4035, + "time_per_iteration": 2.805361032485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060081, + "balance_loss_mlp": 1.05109227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.06152435345467902, + "language_loss": 0.85125613, + "learning_rate": 0.000125426965968645, + "loss": 0.86185694, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4036, + "time_per_iteration": 2.7150938510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064057, + "balance_loss_mlp": 1.05523515, + "diversity_loss_mlp": 0.0, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07000613008602406, + "language_loss": 0.819399, + "learning_rate": 0.00012522067155988092, + "loss": 0.83003962, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4037, + "time_per_iteration": 2.6996352672576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060303, + "balance_loss_mlp": 1.05135584, + "diversity_loss_mlp": 0.0, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.0718823999319763, + "language_loss": 0.75306779, + "learning_rate": 0.00012501452265335617, + "loss": 0.7636708, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4038, + "time_per_iteration": 2.8315415382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066111, + "balance_loss_mlp": 1.05724156, + "diversity_loss_mlp": 0.0, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.06411925705378174, + "language_loss": 0.83063197, + "learning_rate": 0.0001248085193291047, + "loss": 0.84129304, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4039, + "time_per_iteration": 2.729095935821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069535, + "balance_loss_mlp": 1.0605464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05882048458025786, + "language_loss": 0.82089669, + "learning_rate": 0.00012460266166710443, + "loss": 0.83159202, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4040, + "time_per_iteration": 3.1514501571655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_mlp": 1.06013775, + "diversity_loss_mlp": 0.0, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.07867166554480139, + "language_loss": 0.77746958, + "learning_rate": 0.00012439694974727633, + "loss": 0.78815889, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4041, + "time_per_iteration": 3.0117955207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_mlp": 1.05708027, + "diversity_loss_mlp": 0.0, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.06430167773545564, + "language_loss": 0.79798543, + "learning_rate": 0.00012419138364948458, + "loss": 0.80864501, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4042, + "time_per_iteration": 2.7055745124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064858, + "balance_loss_mlp": 1.05601263, + "diversity_loss_mlp": 0.0, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.06788477072783218, + "language_loss": 0.82296908, + "learning_rate": 0.00012398596345353702, + "loss": 0.83361769, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4043, + "time_per_iteration": 2.8943872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_mlp": 1.05608058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06253380969554054, + "language_loss": 0.83342338, + "learning_rate": 0.0001237806892391851, + "loss": 0.8440733, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4044, + "time_per_iteration": 2.697079658508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061837, + "balance_loss_mlp": 1.05312264, + "diversity_loss_mlp": 0.0, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.07069263559946819, + "language_loss": 0.81128013, + "learning_rate": 0.0001235755610861233, + "loss": 0.82189852, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4045, + "time_per_iteration": 2.7329134941101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066232, + "balance_loss_mlp": 1.05731463, + "diversity_loss_mlp": 0.0, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.07032278053298287, + "language_loss": 0.85504925, + "learning_rate": 0.0001233705790739893, + "loss": 0.86571157, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4046, + "time_per_iteration": 2.708867073059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061968, + "balance_loss_mlp": 1.05317056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.08570945023626393, + "language_loss": 0.7512747, + "learning_rate": 0.0001231657432823643, + "loss": 0.76189435, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4047, + "time_per_iteration": 3.209035634994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064295, + "balance_loss_mlp": 1.05536008, + "diversity_loss_mlp": 0.0, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.07478772193794427, + "language_loss": 0.78683329, + "learning_rate": 0.0001229610537907725, + "loss": 0.79747623, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4048, + "time_per_iteration": 2.570645332336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063203, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07810921414498996, + "language_loss": 0.90262878, + "learning_rate": 0.00012275651067868143, + "loss": 0.91326082, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4049, + "time_per_iteration": 2.5862553119659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058533, + "balance_loss_mlp": 1.04978311, + "diversity_loss_mlp": 0.0, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05845393765756997, + "language_loss": 0.80259252, + "learning_rate": 0.00012255211402550182, + "loss": 0.81317782, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4050, + "time_per_iteration": 3.2020328044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055369, + "balance_loss_mlp": 1.04645181, + "diversity_loss_mlp": 0.0, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.07830185849799275, + "language_loss": 0.76506507, + "learning_rate": 0.00012234786391058727, + "loss": 0.77561879, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4051, + "time_per_iteration": 2.823751449584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059116, + "balance_loss_mlp": 1.05021727, + "diversity_loss_mlp": 0.0, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.07934971719083544, + "language_loss": 0.85162616, + "learning_rate": 0.0001221437604132352, + "loss": 0.86221731, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4052, + "time_per_iteration": 2.6284594535827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054893, + "balance_loss_mlp": 1.04598236, + "diversity_loss_mlp": 0.0, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07077897315409304, + "language_loss": 0.8102321, + "learning_rate": 0.0001219398036126852, + "loss": 0.82078099, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4053, + "time_per_iteration": 2.7439231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059971, + "balance_loss_mlp": 1.05101228, + "diversity_loss_mlp": 0.0, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.06870313821829518, + "language_loss": 0.78245676, + "learning_rate": 0.00012173599358812027, + "loss": 0.79305649, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4054, + "time_per_iteration": 3.256080150604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058619, + "balance_loss_mlp": 1.04986334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07402592003625927, + "language_loss": 0.82719493, + "learning_rate": 0.0001215323304186668, + "loss": 0.83778107, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4055, + "time_per_iteration": 2.7612040042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105856, + "balance_loss_mlp": 1.05008435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.06917846158934658, + "language_loss": 0.87829256, + "learning_rate": 0.00012132881418339364, + "loss": 0.88887817, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.08483887, + "routerloss_mlp": 0.0, + "step": 4056, + "time_per_iteration": 2.7365031242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006422, + "balance_loss_mlp": 1.00186825, + "diversity_loss_mlp": 0.0, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.016656968003394067, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78523988, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4057, + "time_per_iteration": 4.83305811882019 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_mlp": 1.04785705, + "diversity_loss_mlp": 0.0, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.06805160455788861, + "language_loss": 0.77303064, + "learning_rate": 0.00012092222283137944, + "loss": 0.78359842, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4058, + "time_per_iteration": 2.749647617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100669, + "balance_loss_mlp": 1.00213623, + "diversity_loss_mlp": 0.0, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.014137874321597207, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79913002, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4059, + "time_per_iteration": 4.786531209945679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060827, + "balance_loss_mlp": 1.0521071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.0627573295973092, + "language_loss": 0.83679825, + "learning_rate": 0.00012051622016348856, + "loss": 0.84740651, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4060, + "time_per_iteration": 2.999849557876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060136, + "balance_loss_mlp": 1.05145788, + "diversity_loss_mlp": 0.0, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.09064537340570315, + "language_loss": 0.84317231, + "learning_rate": 0.00012031343978315539, + "loss": 0.85377359, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4061, + "time_per_iteration": 2.468447208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056381, + "balance_loss_mlp": 1.04746997, + "diversity_loss_mlp": 0.0, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06926307807295869, + "language_loss": 0.8253361, + "learning_rate": 0.00012011080681021774, + "loss": 0.83589995, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4062, + "time_per_iteration": 2.6554322242736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_mlp": 1.04981685, + "diversity_loss_mlp": 0.0, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.07294593948757502, + "language_loss": 0.86419785, + "learning_rate": 0.00011990832132334512, + "loss": 0.87478459, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4063, + "time_per_iteration": 2.514464855194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_mlp": 1.04535961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.07578138035513655, + "language_loss": 0.82624197, + "learning_rate": 0.00011970598340114897, + "loss": 0.83678591, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4064, + "time_per_iteration": 2.931457042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051575, + "balance_loss_mlp": 1.04267633, + "diversity_loss_mlp": 0.0, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.07400316047770077, + "language_loss": 0.84204572, + "learning_rate": 0.00011950379312218396, + "loss": 0.85256147, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4065, + "time_per_iteration": 2.7011330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053821, + "balance_loss_mlp": 1.04467154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.057956585414562535, + "language_loss": 0.86203766, + "learning_rate": 0.00011930175056494719, + "loss": 0.87257588, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4066, + "time_per_iteration": 2.877427816390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04519939, + "diversity_loss_mlp": 0.0, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.057083401886059204, + "language_loss": 0.75923216, + "learning_rate": 0.00011909985580787885, + "loss": 0.76977456, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4067, + "time_per_iteration": 2.624633312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047724, + "balance_loss_mlp": 1.03850365, + "diversity_loss_mlp": 0.0, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.05949124262263275, + "language_loss": 0.81228232, + "learning_rate": 0.00011889810892936137, + "loss": 0.82275951, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4068, + "time_per_iteration": 2.736132860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060005, + "balance_loss_mlp": 1.05080259, + "diversity_loss_mlp": 0.0, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.067986892151795, + "language_loss": 0.77103662, + "learning_rate": 0.00011869651000771959, + "loss": 0.78163677, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4069, + "time_per_iteration": 2.8403103351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_mlp": 1.04549229, + "diversity_loss_mlp": 0.0, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.06684521190560817, + "language_loss": 0.83076346, + "learning_rate": 0.00011849505912122117, + "loss": 0.84130692, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4070, + "time_per_iteration": 2.7008423805236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_mlp": 1.04501987, + "diversity_loss_mlp": 0.0, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07690857771038405, + "language_loss": 0.78090364, + "learning_rate": 0.00011829375634807654, + "loss": 0.79144537, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4071, + "time_per_iteration": 3.033573627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054971, + "balance_loss_mlp": 1.04576814, + "diversity_loss_mlp": 0.0, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.056420463967120596, + "language_loss": 0.81179786, + "learning_rate": 0.00011809260176643821, + "loss": 0.82234752, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4072, + "time_per_iteration": 3.047667980194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057919, + "balance_loss_mlp": 1.0486629, + "diversity_loss_mlp": 0.0, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.08201668927537556, + "language_loss": 0.83855987, + "learning_rate": 0.00011789159545440131, + "loss": 0.84913909, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4073, + "time_per_iteration": 2.5870485305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061783, + "balance_loss_mlp": 1.05281854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05483100075639626, + "language_loss": 0.82342023, + "learning_rate": 0.00011769073749000348, + "loss": 0.83403808, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4074, + "time_per_iteration": 2.7744524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_mlp": 1.05058742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.07650558225741275, + "language_loss": 0.76181698, + "learning_rate": 0.0001174900279512246, + "loss": 0.77241433, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4075, + "time_per_iteration": 2.5718233585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055959, + "balance_loss_mlp": 1.04716742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.06638794146044662, + "language_loss": 0.81755495, + "learning_rate": 0.00011728946691598707, + "loss": 0.82811451, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4076, + "time_per_iteration": 2.597710371017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057965, + "balance_loss_mlp": 1.0489229, + "diversity_loss_mlp": 0.0, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.07312696414479496, + "language_loss": 0.76038092, + "learning_rate": 0.00011708905446215561, + "loss": 0.77096057, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4077, + "time_per_iteration": 2.8587801456451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_mlp": 1.04389191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05480426452035972, + "language_loss": 0.79978698, + "learning_rate": 0.00011688879066753711, + "loss": 0.81031561, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4078, + "time_per_iteration": 2.6878645420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794674, + "balance_loss_mlp": 1.3435601, + "diversity_loss_mlp": 0.22424069, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.037025249970490705, + "language_loss": 0.87360638, + "learning_rate": 0.00011668867560988122, + "loss": 0.88155311, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077335, + "step": 4079, + "time_per_iteration": 2.605992317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055214, + "balance_loss_mlp": 1.04603505, + "diversity_loss_mlp": 0.0, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.07540056238596937, + "language_loss": 0.84502101, + "learning_rate": 0.00011648870936687916, + "loss": 0.85557318, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4080, + "time_per_iteration": 2.803166627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_mlp": 1.04527164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.07109491685615342, + "language_loss": 0.7888999, + "learning_rate": 0.00011628889201616461, + "loss": 0.79944277, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4081, + "time_per_iteration": 2.6307146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053935, + "balance_loss_mlp": 1.04494071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.06995649688675094, + "language_loss": 0.8206296, + "learning_rate": 0.00011608922363531393, + "loss": 0.83116901, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4082, + "time_per_iteration": 2.6929171085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_mlp": 1.04621124, + "diversity_loss_mlp": 0.0, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.06467745732761603, + "language_loss": 0.83401716, + "learning_rate": 0.00011588970430184504, + "loss": 0.84456635, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 4083, + "time_per_iteration": 3.0374722480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055907, + "balance_loss_mlp": 1.04704356, + "diversity_loss_mlp": 0.0, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.053416444226472466, + "language_loss": 0.81812388, + "learning_rate": 0.00011569033409321822, + "loss": 0.82868296, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4084, + "time_per_iteration": 2.7151241302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056433, + "balance_loss_mlp": 1.04721808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08362128305368578, + "language_loss": 0.72967046, + "learning_rate": 0.00011549111308683591, + "loss": 0.74023485, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4085, + "time_per_iteration": 2.703397750854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_mlp": 1.044855, + "diversity_loss_mlp": 0.0, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.07026628399198086, + "language_loss": 0.80478334, + "learning_rate": 0.00011529204136004251, + "loss": 0.81532121, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4086, + "time_per_iteration": 2.4818243980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_mlp": 1.04632854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.06468878784636958, + "language_loss": 0.84670031, + "learning_rate": 0.00011509311899012459, + "loss": 0.85725284, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4087, + "time_per_iteration": 2.6685831546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052321, + "balance_loss_mlp": 1.04333234, + "diversity_loss_mlp": 0.0, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.07857696263976417, + "language_loss": 0.781057, + "learning_rate": 0.00011489434605431053, + "loss": 0.7915802, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4088, + "time_per_iteration": 2.634192705154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050771, + "balance_loss_mlp": 1.0415858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.06849593864396217, + "language_loss": 0.81194121, + "learning_rate": 0.0001146957226297708, + "loss": 0.82244897, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4089, + "time_per_iteration": 2.6896586418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054397, + "balance_loss_mlp": 1.04508066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.06226549816004976, + "language_loss": 0.76514363, + "learning_rate": 0.00011449724879361827, + "loss": 0.77568758, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4090, + "time_per_iteration": 3.0211868286132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105233, + "balance_loss_mlp": 1.04349613, + "diversity_loss_mlp": 0.0, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.10606387135755017, + "language_loss": 0.73947829, + "learning_rate": 0.00011429892462290687, + "loss": 0.75000155, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4091, + "time_per_iteration": 2.663403034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051781, + "balance_loss_mlp": 1.04245293, + "diversity_loss_mlp": 0.0, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.07444773057019392, + "language_loss": 0.83167046, + "learning_rate": 0.00011410075019463295, + "loss": 0.84218824, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4092, + "time_per_iteration": 2.6732146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_mlp": 1.04006362, + "diversity_loss_mlp": 0.0, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.060787527331610934, + "language_loss": 0.80152667, + "learning_rate": 0.00011390272558573461, + "loss": 0.81201625, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4093, + "time_per_iteration": 2.7180373668670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_mlp": 1.03762388, + "diversity_loss_mlp": 0.0, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06490792600835427, + "language_loss": 0.7982657, + "learning_rate": 0.00011370485087309202, + "loss": 0.80873013, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4094, + "time_per_iteration": 2.6366312503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049978, + "balance_loss_mlp": 1.04087603, + "diversity_loss_mlp": 0.0, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.07475345031561743, + "language_loss": 0.79215139, + "learning_rate": 0.00011350712613352688, + "loss": 0.80265117, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4095, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.0379113, + "diversity_loss_mlp": 0.0, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08748048466921367, + "language_loss": 0.79438257, + "learning_rate": 0.00011330955144380283, + "loss": 0.8048501, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4096, + "time_per_iteration": 2.641091823577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_mlp": 1.04231441, + "diversity_loss_mlp": 0.0, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.09762790842246886, + "language_loss": 0.8590734, + "learning_rate": 0.00011311212688062483, + "loss": 0.86958992, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4097, + "time_per_iteration": 2.7734925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_mlp": 1.03907609, + "diversity_loss_mlp": 0.0, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07905994769378807, + "language_loss": 0.77729434, + "learning_rate": 0.0001129148525206402, + "loss": 0.78777593, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4098, + "time_per_iteration": 2.7954680919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043495, + "balance_loss_mlp": 1.03457785, + "diversity_loss_mlp": 0.0, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.07239705861159748, + "language_loss": 0.86597443, + "learning_rate": 0.00011271772844043759, + "loss": 0.87640929, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4099, + "time_per_iteration": 2.6607439517974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_mlp": 1.03621721, + "diversity_loss_mlp": 0.0, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0879845315874332, + "language_loss": 0.76285118, + "learning_rate": 0.00011252075471654727, + "loss": 0.7733022, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4100, + "time_per_iteration": 2.971648693084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_mlp": 1.04207063, + "diversity_loss_mlp": 0.0, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.0764302871750087, + "language_loss": 0.77711362, + "learning_rate": 0.00011232393142544133, + "loss": 0.78762579, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4101, + "time_per_iteration": 2.91229510307312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_mlp": 1.03860378, + "diversity_loss_mlp": 0.0, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.07185195333789275, + "language_loss": 0.82940054, + "learning_rate": 0.00011212725864353323, + "loss": 0.83987677, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4102, + "time_per_iteration": 3.1023645401000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_mlp": 1.02088332, + "diversity_loss_mlp": 0.0, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.024083596003167965, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77361244, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4103, + "time_per_iteration": 4.869060754776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.03684092, + "diversity_loss_mlp": 0.0, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.08808407727788632, + "language_loss": 0.75807375, + "learning_rate": 0.00011173436491267291, + "loss": 0.76853269, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4104, + "time_per_iteration": 2.632619619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051432, + "balance_loss_mlp": 1.04226446, + "diversity_loss_mlp": 0.0, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.06591293045265766, + "language_loss": 0.81841874, + "learning_rate": 0.0001115381441162554, + "loss": 0.82893306, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4105, + "time_per_iteration": 2.6688740253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015618, + "balance_loss_mlp": 1.0112071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.01578072375455914, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74599338, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4106, + "time_per_iteration": 4.878762245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.041677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06419159755656932, + "language_loss": 0.85182965, + "learning_rate": 0.00011114615504234465, + "loss": 0.86233652, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4107, + "time_per_iteration": 2.7453701496124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_mlp": 1.03746724, + "diversity_loss_mlp": 0.0, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.07341048206377168, + "language_loss": 0.80923963, + "learning_rate": 0.00011095038691703468, + "loss": 0.81970477, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4108, + "time_per_iteration": 2.857043504714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_mlp": 1.03800678, + "diversity_loss_mlp": 0.0, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.06655370110946672, + "language_loss": 0.82816958, + "learning_rate": 0.00011075476983417998, + "loss": 0.83864009, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4109, + "time_per_iteration": 2.8551764488220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.04054475, + "diversity_loss_mlp": 0.0, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.08565145998771567, + "language_loss": 0.7770009, + "learning_rate": 0.00011055930386972579, + "loss": 0.78750026, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 4110, + "time_per_iteration": 2.9051218032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_mlp": 1.03950906, + "diversity_loss_mlp": 0.0, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.07889594156212229, + "language_loss": 0.78524226, + "learning_rate": 0.00011036398909955863, + "loss": 0.79572868, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4111, + "time_per_iteration": 2.9591848850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00801967, + "balance_loss_mlp": 1.35861206, + "diversity_loss_mlp": 0.22341654, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.031814716701276446, + "language_loss": 0.81445456, + "learning_rate": 0.00011016882559950648, + "loss": 0.82247424, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0109526, + "step": 4112, + "time_per_iteration": 2.8517532348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.04066622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.06825914372029093, + "language_loss": 0.80628312, + "learning_rate": 0.00010997381344533853, + "loss": 0.81678075, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4113, + "time_per_iteration": 2.76458477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_mlp": 1.04482937, + "diversity_loss_mlp": 0.0, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.06296725861693256, + "language_loss": 0.80975449, + "learning_rate": 0.00010977895271276517, + "loss": 0.82029676, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4114, + "time_per_iteration": 2.677236795425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105271, + "balance_loss_mlp": 1.04387641, + "diversity_loss_mlp": 0.0, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.07698010071595295, + "language_loss": 0.79882276, + "learning_rate": 0.00010958424347743807, + "loss": 0.80934995, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4115, + "time_per_iteration": 2.7255280017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056753, + "balance_loss_mlp": 1.04793203, + "diversity_loss_mlp": 0.0, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06323084510093162, + "language_loss": 0.80379033, + "learning_rate": 0.00010938968581494991, + "loss": 0.81435782, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4116, + "time_per_iteration": 2.956744909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056354, + "balance_loss_mlp": 1.0473659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.07593804019744407, + "language_loss": 0.78918922, + "learning_rate": 0.000109195279800835, + "loss": 0.79975271, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4117, + "time_per_iteration": 2.7232017517089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052824, + "balance_loss_mlp": 1.04372239, + "diversity_loss_mlp": 0.0, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07668598230710005, + "language_loss": 0.76558191, + "learning_rate": 0.00010900102551056834, + "loss": 0.77611017, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4118, + "time_per_iteration": 3.0348682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_mlp": 1.04203153, + "diversity_loss_mlp": 0.0, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.06933579681898581, + "language_loss": 0.8458457, + "learning_rate": 0.00010880692301956601, + "loss": 0.85635561, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4119, + "time_per_iteration": 2.465395212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059146, + "balance_loss_mlp": 1.05027056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.06493837690301978, + "language_loss": 0.86651456, + "learning_rate": 0.00010861297240318518, + "loss": 0.87710601, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4120, + "time_per_iteration": 2.8506181240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.04826188, + "diversity_loss_mlp": 0.0, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.07524766323731863, + "language_loss": 0.87229133, + "learning_rate": 0.00010841917373672444, + "loss": 0.88286078, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4121, + "time_per_iteration": 2.745227336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.04712808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.08118940133699648, + "language_loss": 0.78629029, + "learning_rate": 0.00010822552709542293, + "loss": 0.79684877, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4122, + "time_per_iteration": 2.813340425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_mlp": 1.04677343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.058728515527731805, + "language_loss": 0.86142117, + "learning_rate": 0.0001080320325544612, + "loss": 0.87197673, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4123, + "time_per_iteration": 2.6903398036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_mlp": 1.04438257, + "diversity_loss_mlp": 0.0, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.06377375336372411, + "language_loss": 0.83519953, + "learning_rate": 0.00010783869018895997, + "loss": 0.84573305, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4124, + "time_per_iteration": 2.6091437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_mlp": 1.04709673, + "diversity_loss_mlp": 0.0, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.06290112703691109, + "language_loss": 0.84019685, + "learning_rate": 0.00010764550007398189, + "loss": 0.85075527, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4125, + "time_per_iteration": 2.639021396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105337, + "balance_loss_mlp": 1.04447079, + "diversity_loss_mlp": 0.0, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.059983052052207615, + "language_loss": 0.81026101, + "learning_rate": 0.00010745246228452982, + "loss": 0.8207947, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4126, + "time_per_iteration": 2.567128896713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_mlp": 1.04658413, + "diversity_loss_mlp": 0.0, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.06538981258691282, + "language_loss": 0.81837595, + "learning_rate": 0.00010725957689554771, + "loss": 0.82892644, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.08477783, + "routerloss_mlp": 0.0, + "step": 4127, + "time_per_iteration": 2.7668473720550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105364, + "balance_loss_mlp": 1.04483056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.06455760363891609, + "language_loss": 0.84442085, + "learning_rate": 0.00010706684398192013, + "loss": 0.85495722, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4128, + "time_per_iteration": 2.703094482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.04694915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.10398066376678644, + "language_loss": 0.81773114, + "learning_rate": 0.00010687426361847313, + "loss": 0.82829189, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4129, + "time_per_iteration": 2.730570077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_mlp": 1.04571033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.06937610081260179, + "language_loss": 0.8574326, + "learning_rate": 0.00010668183587997254, + "loss": 0.86797965, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4130, + "time_per_iteration": 2.644259452819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_mlp": 1.04217792, + "diversity_loss_mlp": 0.0, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05953600763070223, + "language_loss": 0.77579701, + "learning_rate": 0.0001064895608411256, + "loss": 0.78630781, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4131, + "time_per_iteration": 2.841925859451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105178, + "balance_loss_mlp": 1.04286337, + "diversity_loss_mlp": 0.0, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06486183241314894, + "language_loss": 0.80494809, + "learning_rate": 0.00010629743857657998, + "loss": 0.81546587, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4132, + "time_per_iteration": 2.9550116062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007878, + "balance_loss_mlp": 1.00334787, + "diversity_loss_mlp": 0.0, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.014279472424614392, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71606547, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4133, + "time_per_iteration": 4.61087965965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_mlp": 1.05091596, + "diversity_loss_mlp": 0.0, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.08419096338195846, + "language_loss": 0.82037973, + "learning_rate": 0.00010591365266868802, + "loss": 0.83097553, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 4134, + "time_per_iteration": 2.980473518371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006707, + "balance_loss_mlp": 1.00217748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.013377465040040408, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76518488, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4135, + "time_per_iteration": 5.031512975692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051378, + "balance_loss_mlp": 1.04224622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.08143958467983652, + "language_loss": 0.7928952, + "learning_rate": 0.00010553047875229166, + "loss": 0.80340898, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4136, + "time_per_iteration": 2.536219596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053745, + "balance_loss_mlp": 1.04491794, + "diversity_loss_mlp": 0.0, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.05917621440441134, + "language_loss": 0.8352496, + "learning_rate": 0.00010533912147689328, + "loss": 0.84578705, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4137, + "time_per_iteration": 2.62947416305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052392, + "balance_loss_mlp": 1.04364753, + "diversity_loss_mlp": 0.0, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.07247645097842569, + "language_loss": 0.82383895, + "learning_rate": 0.00010514791742243656, + "loss": 0.83436286, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4138, + "time_per_iteration": 2.6058223247528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053846, + "balance_loss_mlp": 1.04486322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.07856202151848143, + "language_loss": 0.82678479, + "learning_rate": 0.00010495686666315341, + "loss": 0.83732331, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4139, + "time_per_iteration": 2.8820180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_mlp": 1.04509258, + "diversity_loss_mlp": 0.0, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.09207393340076041, + "language_loss": 0.77504325, + "learning_rate": 0.00010476596927321635, + "loss": 0.78558183, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4140, + "time_per_iteration": 2.5876264572143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.04586816, + "diversity_loss_mlp": 0.0, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.06332389355869186, + "language_loss": 0.80286723, + "learning_rate": 0.00010457522532673835, + "loss": 0.81341445, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4141, + "time_per_iteration": 2.7853429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053842, + "balance_loss_mlp": 1.04521155, + "diversity_loss_mlp": 0.0, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.07594916891501999, + "language_loss": 0.83322799, + "learning_rate": 0.00010438463489777272, + "loss": 0.84376645, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4142, + "time_per_iteration": 2.574995756149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0441432, + "diversity_loss_mlp": 0.0, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06219380630034642, + "language_loss": 0.77388006, + "learning_rate": 0.00010419419806031316, + "loss": 0.78441548, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 4143, + "time_per_iteration": 2.681364059448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057205, + "balance_loss_mlp": 1.04838395, + "diversity_loss_mlp": 0.0, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.06244291716660837, + "language_loss": 0.83778638, + "learning_rate": 0.00010400391488829403, + "loss": 0.84835839, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4144, + "time_per_iteration": 2.7661397457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_mlp": 1.04754949, + "diversity_loss_mlp": 0.0, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.056029857219710606, + "language_loss": 0.86605, + "learning_rate": 0.00010381378545558984, + "loss": 0.87661684, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4145, + "time_per_iteration": 2.706909656524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_mlp": 1.04191816, + "diversity_loss_mlp": 0.0, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.06718577287314217, + "language_loss": 0.84665811, + "learning_rate": 0.00010362380983601505, + "loss": 0.85716891, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4146, + "time_per_iteration": 2.529480218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055069, + "balance_loss_mlp": 1.04609227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.0571367932207486, + "language_loss": 0.7866556, + "learning_rate": 0.00010343398810332477, + "loss": 0.79720628, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4147, + "time_per_iteration": 3.4586639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105507, + "balance_loss_mlp": 1.04595661, + "diversity_loss_mlp": 0.0, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.07566676342485233, + "language_loss": 0.84437156, + "learning_rate": 0.00010324432033121467, + "loss": 0.85492229, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4148, + "time_per_iteration": 2.8839025497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053366, + "balance_loss_mlp": 1.04418659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.06830192551222886, + "language_loss": 0.83435208, + "learning_rate": 0.00010305480659332005, + "loss": 0.84488571, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4149, + "time_per_iteration": 2.5951197147369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059283, + "balance_loss_mlp": 1.05012214, + "diversity_loss_mlp": 0.0, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.07563453451103978, + "language_loss": 0.83492422, + "learning_rate": 0.00010286544696321682, + "loss": 0.84551704, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4150, + "time_per_iteration": 2.5118510723114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055471, + "balance_loss_mlp": 1.04628563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.07562833621575128, + "language_loss": 0.7924732, + "learning_rate": 0.00010267624151442073, + "loss": 0.80302793, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4151, + "time_per_iteration": 2.612138509750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052309, + "balance_loss_mlp": 1.04312396, + "diversity_loss_mlp": 0.0, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.07020647270289845, + "language_loss": 0.80794007, + "learning_rate": 0.000102487190320388, + "loss": 0.81846315, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4152, + "time_per_iteration": 3.3858306407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_mlp": 1.0432297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.08528953367031804, + "language_loss": 0.79654646, + "learning_rate": 0.00010229829345451475, + "loss": 0.80707145, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4153, + "time_per_iteration": 3.326597213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.04706669, + "diversity_loss_mlp": 0.0, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06462141101761633, + "language_loss": 0.79619837, + "learning_rate": 0.00010210955099013724, + "loss": 0.80676001, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4154, + "time_per_iteration": 3.3817038536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054824, + "balance_loss_mlp": 1.04566312, + "diversity_loss_mlp": 0.0, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07616557599778462, + "language_loss": 0.76846623, + "learning_rate": 0.00010192096300053167, + "loss": 0.77901447, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4155, + "time_per_iteration": 3.081740379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105106, + "balance_loss_mlp": 1.04188037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.0612954553036602, + "language_loss": 0.85157597, + "learning_rate": 0.00010173252955891477, + "loss": 0.86208659, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4156, + "time_per_iteration": 2.7239129543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055997, + "balance_loss_mlp": 1.04709256, + "diversity_loss_mlp": 0.0, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07720224754254114, + "language_loss": 0.73362273, + "learning_rate": 0.00010154425073844253, + "loss": 0.74418271, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4157, + "time_per_iteration": 2.696467638015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052019, + "balance_loss_mlp": 1.04316235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.060505733748086536, + "language_loss": 0.82517296, + "learning_rate": 0.00010135612661221138, + "loss": 0.83569312, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4158, + "time_per_iteration": 2.582913398742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_mlp": 1.03880203, + "diversity_loss_mlp": 0.0, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.08198302238912947, + "language_loss": 0.81945235, + "learning_rate": 0.00010116815725325751, + "loss": 0.82993186, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4159, + "time_per_iteration": 3.28433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798548, + "balance_loss_mlp": 1.34939909, + "diversity_loss_mlp": 0.22584054, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.032371691049230863, + "language_loss": 0.80472159, + "learning_rate": 0.00010098034273455725, + "loss": 0.81270707, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01092844, + "step": 4160, + "time_per_iteration": 3.020301342010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047481, + "balance_loss_mlp": 1.03802133, + "diversity_loss_mlp": 0.0, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.06923738075728161, + "language_loss": 0.79914421, + "learning_rate": 0.00010079268312902662, + "loss": 0.80961907, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 4161, + "time_per_iteration": 2.663827657699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_mlp": 1.04445577, + "diversity_loss_mlp": 0.0, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.07955090405050065, + "language_loss": 0.82002842, + "learning_rate": 0.0001006051785095215, + "loss": 0.83056211, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4162, + "time_per_iteration": 2.669938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052136, + "balance_loss_mlp": 1.04306972, + "diversity_loss_mlp": 0.0, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.07737392704066832, + "language_loss": 0.79858398, + "learning_rate": 0.0001004178289488376, + "loss": 0.80910534, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4163, + "time_per_iteration": 2.7215919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052219, + "balance_loss_mlp": 1.04284358, + "diversity_loss_mlp": 0.0, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.06994031793136987, + "language_loss": 0.83999282, + "learning_rate": 0.0001002306345197106, + "loss": 0.85051501, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 4164, + "time_per_iteration": 2.545501708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049385, + "balance_loss_mlp": 1.04034317, + "diversity_loss_mlp": 0.0, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.07265204276246538, + "language_loss": 0.80238962, + "learning_rate": 0.00010004359529481571, + "loss": 0.81288344, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4165, + "time_per_iteration": 2.8751044273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_mlp": 1.04052877, + "diversity_loss_mlp": 0.0, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.07344708402099766, + "language_loss": 0.82382286, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83431858, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4166, + "time_per_iteration": 3.706587314605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051894, + "balance_loss_mlp": 1.04301274, + "diversity_loss_mlp": 0.0, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.0782603427027698, + "language_loss": 0.83461916, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84513807, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4167, + "time_per_iteration": 2.5965118408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.04132366, + "diversity_loss_mlp": 0.0, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.08470873380508834, + "language_loss": 0.81762064, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82812226, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4168, + "time_per_iteration": 2.6369173526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.04494286, + "diversity_loss_mlp": 0.0, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.07955948845391579, + "language_loss": 0.79946613, + "learning_rate": 9.929699188895447e-05, + "loss": 0.81000549, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4169, + "time_per_iteration": 3.257819652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590619, + "balance_loss_mlp": 1.02878523, + "diversity_loss_mlp": 0.13400336, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.001271365187533197, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.78645021, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00922488, + "step": 4170, + "time_per_iteration": 4.967956066131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052767, + "balance_loss_mlp": 1.04368353, + "diversity_loss_mlp": 0.0, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.06699330376146911, + "language_loss": 0.83303684, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84356451, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4171, + "time_per_iteration": 2.511323928833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_mlp": 1.04476857, + "diversity_loss_mlp": 0.0, + "epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.0707874133261092, + "language_loss": 0.7890135, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79955202, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4172, + "time_per_iteration": 2.938361644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002455, + "balance_loss_mlp": 0.99778163, + "diversity_loss_mlp": 0.0, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.01094338931973828, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81267017, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.04663086, + "routerloss_mlp": 0.0, + "step": 4173, + "time_per_iteration": 4.908462285995483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793014, + "balance_loss_mlp": 1.33927226, + "diversity_loss_mlp": 0.22488941, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.03516130293682118, + "language_loss": 0.88785201, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89578211, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01093344, + "step": 4174, + "time_per_iteration": 2.5922460556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053625, + "balance_loss_mlp": 1.04467213, + "diversity_loss_mlp": 0.0, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.07944554575907646, + "language_loss": 0.78243375, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79296994, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4175, + "time_per_iteration": 2.6601076126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051864, + "balance_loss_mlp": 1.04280424, + "diversity_loss_mlp": 0.0, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.06387478026678979, + "language_loss": 0.84549594, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85601461, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4176, + "time_per_iteration": 2.7655818462371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_mlp": 1.0406065, + "diversity_loss_mlp": 0.0, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.07434715811474918, + "language_loss": 0.81265736, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82315314, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4177, + "time_per_iteration": 2.7365646362304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051174, + "balance_loss_mlp": 1.04198945, + "diversity_loss_mlp": 0.0, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.0854183247343152, + "language_loss": 0.84699386, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85750556, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4178, + "time_per_iteration": 2.6607935428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047249, + "balance_loss_mlp": 1.03811717, + "diversity_loss_mlp": 0.0, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.07548014236337308, + "language_loss": 0.79687864, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80735117, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4179, + "time_per_iteration": 2.649068593978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001844, + "balance_loss_mlp": 0.99719512, + "diversity_loss_mlp": 0.0, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.010296775940752873, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75735408, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4180, + "time_per_iteration": 4.874431133270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050118, + "balance_loss_mlp": 1.04090953, + "diversity_loss_mlp": 0.0, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.07453821883084652, + "language_loss": 0.77098471, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78148586, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4181, + "time_per_iteration": 2.687908887863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045921, + "balance_loss_mlp": 1.03656304, + "diversity_loss_mlp": 0.0, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.06056113889476793, + "language_loss": 0.80571556, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81617486, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 4182, + "time_per_iteration": 2.755779981613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_mlp": 1.04540682, + "diversity_loss_mlp": 0.0, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.07500472983981471, + "language_loss": 0.7412895, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75183195, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4183, + "time_per_iteration": 2.959167242050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.03698587, + "diversity_loss_mlp": 0.0, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.07263280839339305, + "language_loss": 0.78791356, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79837459, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4184, + "time_per_iteration": 2.7201614379882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_mlp": 1.0401814, + "diversity_loss_mlp": 0.0, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.0782100616306692, + "language_loss": 0.77473164, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78522313, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4185, + "time_per_iteration": 3.134634256362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010494, + "balance_loss_mlp": 1.04034662, + "diversity_loss_mlp": 0.0, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.06922447607886563, + "language_loss": 0.77592742, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78642142, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4186, + "time_per_iteration": 2.961618423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_mlp": 1.04297376, + "diversity_loss_mlp": 0.0, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0745309975524961, + "language_loss": 0.81570286, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82622111, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4187, + "time_per_iteration": 2.9941747188568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050277, + "balance_loss_mlp": 1.04128897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.06519286758654869, + "language_loss": 0.87670028, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88720298, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4188, + "time_per_iteration": 2.8933184146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049222, + "balance_loss_mlp": 1.04018009, + "diversity_loss_mlp": 0.0, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.07111853308758409, + "language_loss": 0.78227425, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79276645, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4189, + "time_per_iteration": 2.8829870223999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050789, + "balance_loss_mlp": 1.0420208, + "diversity_loss_mlp": 0.0, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.07715619542299273, + "language_loss": 0.81660378, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8271116, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4190, + "time_per_iteration": 2.805797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_mlp": 1.041592, + "diversity_loss_mlp": 0.0, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.07784281121647556, + "language_loss": 0.82554364, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83604801, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4191, + "time_per_iteration": 2.555079460144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055942, + "balance_loss_mlp": 1.0468998, + "diversity_loss_mlp": 0.0, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.08129119625333912, + "language_loss": 0.85176903, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86232841, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4192, + "time_per_iteration": 2.616278648376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057227, + "balance_loss_mlp": 1.04823291, + "diversity_loss_mlp": 0.0, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.06195550147591559, + "language_loss": 0.8222602, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83283252, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4193, + "time_per_iteration": 3.1774582862854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055176, + "balance_loss_mlp": 1.04616332, + "diversity_loss_mlp": 0.0, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.07492247011829438, + "language_loss": 0.82230604, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83285773, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4194, + "time_per_iteration": 2.711758613586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105404, + "balance_loss_mlp": 1.04514122, + "diversity_loss_mlp": 0.0, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.05932556750810355, + "language_loss": 0.81710708, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82764751, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4195, + "time_per_iteration": 2.6608495712280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058947, + "balance_loss_mlp": 1.04997635, + "diversity_loss_mlp": 0.0, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.063085164329588, + "language_loss": 0.79428887, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80487841, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4196, + "time_per_iteration": 3.337599515914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_mlp": 1.04924726, + "diversity_loss_mlp": 0.0, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.06810941123985487, + "language_loss": 0.82549566, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83608055, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4197, + "time_per_iteration": 2.8500421047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793266, + "balance_loss_mlp": 1.33856153, + "diversity_loss_mlp": 0.22554049, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.030004109162440378, + "language_loss": 0.80021191, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80814457, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01121513, + "step": 4198, + "time_per_iteration": 2.7939352989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061077, + "balance_loss_mlp": 1.05221987, + "diversity_loss_mlp": 0.0, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.07237334672543508, + "language_loss": 0.79747534, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80808604, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4199, + "time_per_iteration": 2.616584300994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060422, + "balance_loss_mlp": 1.05148149, + "diversity_loss_mlp": 0.0, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.07529347845483464, + "language_loss": 0.83181953, + "learning_rate": 9.359556131514602e-05, + "loss": 0.8424238, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4200, + "time_per_iteration": 2.6320338249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788939, + "balance_loss_mlp": 1.33364224, + "diversity_loss_mlp": 0.22200125, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.03126306975747278, + "language_loss": 0.8159976, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82388693, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01111754, + "step": 4201, + "time_per_iteration": 2.725898265838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060854, + "balance_loss_mlp": 1.05191302, + "diversity_loss_mlp": 0.0, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.07028300429625041, + "language_loss": 0.75730419, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76791275, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4202, + "time_per_iteration": 2.858754873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057987, + "balance_loss_mlp": 1.04905808, + "diversity_loss_mlp": 0.0, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.07410213802766576, + "language_loss": 0.72826529, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73884517, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4203, + "time_per_iteration": 2.910843849182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.04489827, + "diversity_loss_mlp": 0.0, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.07872218498382196, + "language_loss": 0.88753879, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89807671, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4204, + "time_per_iteration": 2.531914234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059733, + "balance_loss_mlp": 1.05073869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.05750820164302825, + "language_loss": 0.87048918, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88108647, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4205, + "time_per_iteration": 2.7968151569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052186, + "balance_loss_mlp": 1.04308999, + "diversity_loss_mlp": 0.0, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06433103951625496, + "language_loss": 0.8483271, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85884893, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4206, + "time_per_iteration": 2.665961742401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_mlp": 1.04172313, + "diversity_loss_mlp": 0.0, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.0784365412189913, + "language_loss": 0.77137649, + "learning_rate": 9.232905077078824e-05, + "loss": 0.7818836, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4207, + "time_per_iteration": 2.7918972969055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105439, + "balance_loss_mlp": 1.04530609, + "diversity_loss_mlp": 0.0, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.07290792729834863, + "language_loss": 0.76617867, + "learning_rate": 9.214875321953164e-05, + "loss": 0.77672255, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4208, + "time_per_iteration": 2.6330010890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056212, + "balance_loss_mlp": 1.04722369, + "diversity_loss_mlp": 0.0, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06967828145804263, + "language_loss": 0.81180429, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82236642, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4209, + "time_per_iteration": 2.8048768043518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_mlp": 1.04471278, + "diversity_loss_mlp": 0.0, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.08832200116465504, + "language_loss": 0.79589164, + "learning_rate": 9.178863321264475e-05, + "loss": 0.8064298, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4210, + "time_per_iteration": 2.775315046310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053364, + "balance_loss_mlp": 1.04430985, + "diversity_loss_mlp": 0.0, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.05749425026246104, + "language_loss": 0.79754937, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80808306, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4211, + "time_per_iteration": 2.6467440128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051728, + "balance_loss_mlp": 1.04233456, + "diversity_loss_mlp": 0.0, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.06468521234127066, + "language_loss": 0.8684355, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87895274, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4212, + "time_per_iteration": 2.6296494007110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051493, + "balance_loss_mlp": 1.04236174, + "diversity_loss_mlp": 0.0, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.05999607560391635, + "language_loss": 0.84117031, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85168523, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4213, + "time_per_iteration": 2.834974527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_mlp": 1.03935528, + "diversity_loss_mlp": 0.0, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.07539161755647025, + "language_loss": 0.85083151, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86131608, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4214, + "time_per_iteration": 2.8861420154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.04424381, + "diversity_loss_mlp": 0.0, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.07165268891230793, + "language_loss": 0.81364369, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82417578, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4215, + "time_per_iteration": 2.6690080165863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_mlp": 1.03829873, + "diversity_loss_mlp": 0.0, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.05808229124837682, + "language_loss": 0.83832216, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84879631, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4216, + "time_per_iteration": 2.8289334774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012526, + "balance_loss_mlp": 1.00782871, + "diversity_loss_mlp": 0.0, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.01559500500099235, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78272945, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.046875, + "routerloss_mlp": 0.0, + "step": 4217, + "time_per_iteration": 4.674102067947388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.04281104, + "diversity_loss_mlp": 0.0, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.07154355832559847, + "language_loss": 0.85079706, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86131632, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4218, + "time_per_iteration": 2.8154706954956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_mlp": 1.03502214, + "diversity_loss_mlp": 0.0, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.06078221490906587, + "language_loss": 0.79071403, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80115348, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4219, + "time_per_iteration": 2.9709677696228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_mlp": 1.03838241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.07350013125355677, + "language_loss": 0.80881071, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81928694, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4220, + "time_per_iteration": 2.7022857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_mlp": 1.03677237, + "diversity_loss_mlp": 0.0, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.06142059768116679, + "language_loss": 0.87557077, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88603091, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4221, + "time_per_iteration": 2.637735366821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052382, + "balance_loss_mlp": 1.04335153, + "diversity_loss_mlp": 0.0, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.06689891729172881, + "language_loss": 0.83563554, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84615934, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4222, + "time_per_iteration": 2.792409658432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010868, + "balance_loss_mlp": 1.00612342, + "diversity_loss_mlp": 0.0, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.013920089604171917, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79260939, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 4223, + "time_per_iteration": 4.96152138710022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051266, + "balance_loss_mlp": 1.04209328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.07751812943068913, + "language_loss": 0.8010273, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81153995, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4224, + "time_per_iteration": 2.6411619186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010841, + "balance_loss_mlp": 1.00612068, + "diversity_loss_mlp": 0.0, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.013617776499522711, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77506471, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.04711914, + "routerloss_mlp": 0.0, + "step": 4225, + "time_per_iteration": 4.849999904632568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_mlp": 1.03853297, + "diversity_loss_mlp": 0.0, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06825415899254728, + "language_loss": 0.88826978, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89874828, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4226, + "time_per_iteration": 2.6051902770996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104715, + "balance_loss_mlp": 1.03806627, + "diversity_loss_mlp": 0.0, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.07913354085389648, + "language_loss": 0.80409497, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81456649, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4227, + "time_per_iteration": 2.709742307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_mlp": 1.03684425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.11840379948544452, + "language_loss": 0.82457888, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83503902, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4228, + "time_per_iteration": 2.6976981163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_mlp": 1.04245067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.077990176521043, + "language_loss": 0.78880024, + "learning_rate": 8.839918887251025e-05, + "loss": 0.79931819, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4229, + "time_per_iteration": 2.7945659160614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_mlp": 1.04340506, + "diversity_loss_mlp": 0.0, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.06092121648139386, + "language_loss": 0.84136802, + "learning_rate": 8.822239090334472e-05, + "loss": 0.8518936, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4230, + "time_per_iteration": 2.946951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_mlp": 1.03831291, + "diversity_loss_mlp": 0.0, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.06877906362209742, + "language_loss": 0.75546557, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7659418, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4231, + "time_per_iteration": 2.8897807598114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051697, + "balance_loss_mlp": 1.04225588, + "diversity_loss_mlp": 0.0, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.07632389877762422, + "language_loss": 0.82944, + "learning_rate": 8.786927463232774e-05, + "loss": 0.839957, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 4232, + "time_per_iteration": 2.755648374557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052198, + "balance_loss_mlp": 1.04287577, + "diversity_loss_mlp": 0.0, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.07245949865511514, + "language_loss": 0.81604928, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82657123, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4233, + "time_per_iteration": 2.573910713195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048453, + "balance_loss_mlp": 1.03923225, + "diversity_loss_mlp": 0.0, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.07474822596726854, + "language_loss": 0.82091659, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83140111, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4234, + "time_per_iteration": 2.595383405685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_mlp": 1.04080522, + "diversity_loss_mlp": 0.0, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05760879468903708, + "language_loss": 0.86682582, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87732702, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4235, + "time_per_iteration": 2.8454620838165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050489, + "balance_loss_mlp": 1.04129791, + "diversity_loss_mlp": 0.0, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.07072559835413951, + "language_loss": 0.78216445, + "learning_rate": 8.716496267753343e-05, + "loss": 0.7926693, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4236, + "time_per_iteration": 2.4742040634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_mlp": 1.03813028, + "diversity_loss_mlp": 0.0, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.06449709049791848, + "language_loss": 0.81412882, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82460093, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4237, + "time_per_iteration": 2.7545273303985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006046, + "balance_loss_mlp": 1.00139654, + "diversity_loss_mlp": 0.0, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.010587263465776719, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78858888, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4238, + "time_per_iteration": 5.016268730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.03776968, + "diversity_loss_mlp": 0.0, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.0684339838675198, + "language_loss": 0.82887548, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83934742, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 4239, + "time_per_iteration": 2.5211598873138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052042, + "balance_loss_mlp": 1.04271388, + "diversity_loss_mlp": 0.0, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.06874840636234532, + "language_loss": 0.85361314, + "learning_rate": 8.646321514990763e-05, + "loss": 0.8641336, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4240, + "time_per_iteration": 3.083944797515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00785137, + "balance_loss_mlp": 1.32642579, + "diversity_loss_mlp": 0.22223642, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.03037997104545499, + "language_loss": 0.81663668, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82448804, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108057, + "step": 4241, + "time_per_iteration": 2.849032163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796697, + "balance_loss_mlp": 1.3468852, + "diversity_loss_mlp": 0.22464219, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.041459762566519655, + "language_loss": 0.84508646, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85305345, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0109333, + "step": 4242, + "time_per_iteration": 2.6374778747558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_mlp": 1.0404923, + "diversity_loss_mlp": 0.0, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.06813712019116032, + "language_loss": 0.80444574, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81494176, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4243, + "time_per_iteration": 2.5741348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005855, + "balance_loss_mlp": 1.00120556, + "diversity_loss_mlp": 0.0, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.012183850402686274, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76290977, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4244, + "time_per_iteration": 4.708779573440552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079579, + "balance_loss_mlp": 1.34605587, + "diversity_loss_mlp": 0.22397524, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.030280251177676618, + "language_loss": 0.86728865, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87524652, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077495, + "step": 4245, + "time_per_iteration": 2.9368019104003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006174, + "balance_loss_mlp": 1.00154853, + "diversity_loss_mlp": 0.0, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.013862139423476765, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73980916, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.04614258, + "routerloss_mlp": 0.0, + "step": 4246, + "time_per_iteration": 4.941858291625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078277, + "balance_loss_mlp": 1.31999934, + "diversity_loss_mlp": 0.22372745, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.027810419821976344, + "language_loss": 0.84806323, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85589087, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01090694, + "step": 4247, + "time_per_iteration": 2.7287490367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791953, + "balance_loss_mlp": 1.33846903, + "diversity_loss_mlp": 0.22388186, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.03087757735964202, + "language_loss": 0.84696209, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85488164, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077755, + "step": 4248, + "time_per_iteration": 2.7625157833099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053911, + "balance_loss_mlp": 1.04469025, + "diversity_loss_mlp": 0.0, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.06506910983745173, + "language_loss": 0.80918235, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81972146, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4249, + "time_per_iteration": 2.6258833408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_mlp": 1.03533733, + "diversity_loss_mlp": 0.0, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.06744676767794172, + "language_loss": 0.78911942, + "learning_rate": 8.47200942668846e-05, + "loss": 0.79956502, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4250, + "time_per_iteration": 2.7859880924224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048873, + "balance_loss_mlp": 1.03986096, + "diversity_loss_mlp": 0.0, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.09007032647039148, + "language_loss": 0.80543828, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81592703, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4251, + "time_per_iteration": 2.8444883823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050772, + "balance_loss_mlp": 1.04183125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.06143293566062141, + "language_loss": 0.87781107, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88831878, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4252, + "time_per_iteration": 2.710468053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051813, + "balance_loss_mlp": 1.04260981, + "diversity_loss_mlp": 0.0, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.06730242930695572, + "language_loss": 0.84812832, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85864639, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4253, + "time_per_iteration": 2.7251899242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_mlp": 1.04279804, + "diversity_loss_mlp": 0.0, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.07105593379415724, + "language_loss": 0.77203315, + "learning_rate": 8.402735645731157e-05, + "loss": 0.7825532, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4254, + "time_per_iteration": 2.8979763984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_mlp": 1.03733134, + "diversity_loss_mlp": 0.0, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.07494925573658785, + "language_loss": 0.77925122, + "learning_rate": 8.385457557424098e-05, + "loss": 0.78971332, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4255, + "time_per_iteration": 2.5896246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048093, + "balance_loss_mlp": 1.03896809, + "diversity_loss_mlp": 0.0, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.05893979232495145, + "language_loss": 0.79938138, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80986238, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4256, + "time_per_iteration": 3.068570852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.03782368, + "diversity_loss_mlp": 0.0, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.07101674717136439, + "language_loss": 0.80977142, + "learning_rate": 8.350949856106283e-05, + "loss": 0.82024205, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4257, + "time_per_iteration": 2.7494471073150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006173, + "balance_loss_mlp": 1.00154781, + "diversity_loss_mlp": 0.0, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.007149039484563577, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72155517, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.04614258, + "routerloss_mlp": 0.0, + "step": 4258, + "time_per_iteration": 4.839837074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_mlp": 1.03455889, + "diversity_loss_mlp": 0.0, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.06534196989657123, + "language_loss": 0.84030735, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85074329, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4259, + "time_per_iteration": 2.6422817707061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_mlp": 1.04123449, + "diversity_loss_mlp": 0.0, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.05670368476253994, + "language_loss": 0.85545492, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86595714, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4260, + "time_per_iteration": 3.125713586807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050233, + "balance_loss_mlp": 1.04122066, + "diversity_loss_mlp": 0.0, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.06904116359736774, + "language_loss": 0.81980395, + "learning_rate": 8.282128542083101e-05, + "loss": 0.83030629, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4261, + "time_per_iteration": 2.76778507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.03641081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.058406154368980764, + "language_loss": 0.85347754, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86393321, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4262, + "time_per_iteration": 2.628774404525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.04290724, + "diversity_loss_mlp": 0.0, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.09112328550849395, + "language_loss": 0.85125005, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86177158, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4263, + "time_per_iteration": 2.7492353916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048431, + "balance_loss_mlp": 1.03952646, + "diversity_loss_mlp": 0.0, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.06356232342525024, + "language_loss": 0.82992971, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84041393, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4264, + "time_per_iteration": 3.54941725730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052131, + "balance_loss_mlp": 1.04260004, + "diversity_loss_mlp": 0.0, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.061154055751469906, + "language_loss": 0.79944229, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80996358, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 4265, + "time_per_iteration": 2.677964210510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052685, + "balance_loss_mlp": 1.04367328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.06353811334374408, + "language_loss": 0.78419554, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79472238, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4266, + "time_per_iteration": 3.203380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052882, + "balance_loss_mlp": 1.04395366, + "diversity_loss_mlp": 0.0, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.06191713334502218, + "language_loss": 0.80525327, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81578207, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4267, + "time_per_iteration": 2.6596202850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056507, + "balance_loss_mlp": 1.04715538, + "diversity_loss_mlp": 0.0, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.06008885513704129, + "language_loss": 0.81976879, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83033383, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4268, + "time_per_iteration": 2.8304736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053453, + "balance_loss_mlp": 1.04451835, + "diversity_loss_mlp": 0.0, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.06523361113761998, + "language_loss": 0.81845587, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82899046, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4269, + "time_per_iteration": 2.7376768589019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_mlp": 1.04417932, + "diversity_loss_mlp": 0.0, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.07673767837283801, + "language_loss": 0.83897698, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84950882, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4270, + "time_per_iteration": 2.6805686950683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051913, + "balance_loss_mlp": 1.04284751, + "diversity_loss_mlp": 0.0, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.07279388279593675, + "language_loss": 0.85111851, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86163765, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4271, + "time_per_iteration": 3.080000877380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783822, + "balance_loss_mlp": 1.32480633, + "diversity_loss_mlp": 0.22162104, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.033634037430315754, + "language_loss": 0.82290757, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83074582, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01060844, + "step": 4272, + "time_per_iteration": 2.615750789642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.04102731, + "diversity_loss_mlp": 0.0, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.07856247677174821, + "language_loss": 0.86208439, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87258351, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4273, + "time_per_iteration": 2.6263344287872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051099, + "balance_loss_mlp": 1.04169989, + "diversity_loss_mlp": 0.0, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.08144467378809686, + "language_loss": 0.89614367, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90665472, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4274, + "time_per_iteration": 2.400228500366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054991, + "balance_loss_mlp": 1.04600263, + "diversity_loss_mlp": 0.0, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.06163444359601604, + "language_loss": 0.86974454, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88029444, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4275, + "time_per_iteration": 2.6878175735473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048878, + "balance_loss_mlp": 1.03988957, + "diversity_loss_mlp": 0.0, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.07177776406534302, + "language_loss": 0.82458985, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83507866, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4276, + "time_per_iteration": 2.666274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050704, + "balance_loss_mlp": 1.04178667, + "diversity_loss_mlp": 0.0, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.06822688117582502, + "language_loss": 0.79940748, + "learning_rate": 8.009438945831771e-05, + "loss": 0.80991459, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4277, + "time_per_iteration": 2.6920108795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_mlp": 1.04362309, + "diversity_loss_mlp": 0.0, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.06798166655440095, + "language_loss": 0.79305434, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80358148, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4278, + "time_per_iteration": 2.6593875885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056035, + "balance_loss_mlp": 1.04679036, + "diversity_loss_mlp": 0.0, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 0.07994138400827414, + "language_loss": 0.82999951, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84055984, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4279, + "time_per_iteration": 2.6803600788116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_mlp": 1.04226494, + "diversity_loss_mlp": 0.0, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.060738985338191206, + "language_loss": 0.744928, + "learning_rate": 7.958773444541916e-05, + "loss": 0.7554431, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 4280, + "time_per_iteration": 2.7890987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055668, + "balance_loss_mlp": 1.04667926, + "diversity_loss_mlp": 0.0, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.06641835359143249, + "language_loss": 0.78285408, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79341078, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4281, + "time_per_iteration": 3.0231053829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052951, + "balance_loss_mlp": 1.04405797, + "diversity_loss_mlp": 0.0, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.07232954234982779, + "language_loss": 0.81364781, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82417727, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4282, + "time_per_iteration": 2.702601909637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009495, + "balance_loss_mlp": 1.00503695, + "diversity_loss_mlp": 0.0, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.005580683595342396, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76307166, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4283, + "time_per_iteration": 4.935715675354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057513, + "balance_loss_mlp": 1.04841709, + "diversity_loss_mlp": 0.0, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.0758894988729268, + "language_loss": 0.81082892, + "learning_rate": 7.89144797921037e-05, + "loss": 0.82140398, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4284, + "time_per_iteration": 2.6500790119171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010322, + "balance_loss_mlp": 1.00588739, + "diversity_loss_mlp": 0.0, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.005340107036422925, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78944594, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4285, + "time_per_iteration": 4.93043065071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055758, + "balance_loss_mlp": 1.04675198, + "diversity_loss_mlp": 0.0, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.052404155401405805, + "language_loss": 0.82728308, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83784062, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4286, + "time_per_iteration": 3.1566803455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054082, + "balance_loss_mlp": 1.04502165, + "diversity_loss_mlp": 0.0, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.07426299244547702, + "language_loss": 0.76636487, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77690566, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4287, + "time_per_iteration": 2.894404888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054163, + "balance_loss_mlp": 1.04488242, + "diversity_loss_mlp": 0.0, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.05641463912536871, + "language_loss": 0.79555058, + "learning_rate": 7.824384081587637e-05, + "loss": 0.8060922, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 4288, + "time_per_iteration": 2.8229329586029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058243, + "balance_loss_mlp": 1.04930818, + "diversity_loss_mlp": 0.0, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.0762203665991507, + "language_loss": 0.86487937, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87546182, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4289, + "time_per_iteration": 3.1116397380828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_mlp": 1.04234433, + "diversity_loss_mlp": 0.0, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.0740808728635397, + "language_loss": 0.78204668, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79255825, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4290, + "time_per_iteration": 2.9050347805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054866, + "balance_loss_mlp": 1.04616976, + "diversity_loss_mlp": 0.0, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.058080618005571384, + "language_loss": 0.87400663, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88455528, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 4291, + "time_per_iteration": 3.1467742919921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_mlp": 1.04383206, + "diversity_loss_mlp": 0.0, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.06448799909112234, + "language_loss": 0.77267563, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78320277, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4292, + "time_per_iteration": 2.875955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105942, + "balance_loss_mlp": 1.05067623, + "diversity_loss_mlp": 0.0, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 0.06489065655526868, + "language_loss": 0.80734456, + "learning_rate": 7.740922673634537e-05, + "loss": 0.8179388, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4293, + "time_per_iteration": 2.906735420227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105726, + "balance_loss_mlp": 1.04794431, + "diversity_loss_mlp": 0.0, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.06785179357058724, + "language_loss": 0.78951818, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80009079, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4294, + "time_per_iteration": 2.721102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.04291677, + "diversity_loss_mlp": 0.0, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.07073253675532468, + "language_loss": 0.8505556, + "learning_rate": 7.707652910136098e-05, + "loss": 0.8610754, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4295, + "time_per_iteration": 2.7751898765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055812, + "balance_loss_mlp": 1.04672778, + "diversity_loss_mlp": 0.0, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.06741164173780789, + "language_loss": 0.84659898, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85715711, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4296, + "time_per_iteration": 2.6647472381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056393, + "balance_loss_mlp": 1.04746425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.07582259364872852, + "language_loss": 0.75999844, + "learning_rate": 7.674448824012514e-05, + "loss": 0.77056229, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4297, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.04438508, + "diversity_loss_mlp": 0.0, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.05929184332183984, + "language_loss": 0.83883959, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84937572, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4298, + "time_per_iteration": 2.8329238891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053687, + "balance_loss_mlp": 1.04474664, + "diversity_loss_mlp": 0.0, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.07448007019964706, + "language_loss": 0.84225285, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85278976, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4299, + "time_per_iteration": 2.489332675933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_mlp": 1.04037237, + "diversity_loss_mlp": 0.0, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.06599892876771768, + "language_loss": 0.85128617, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86177909, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4300, + "time_per_iteration": 3.732990026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055631, + "balance_loss_mlp": 1.04661894, + "diversity_loss_mlp": 0.0, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.05906795179451105, + "language_loss": 0.82889211, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83944845, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4301, + "time_per_iteration": 2.690711259841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_mlp": 1.03897715, + "diversity_loss_mlp": 0.0, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.07258594610710227, + "language_loss": 0.77361107, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78409171, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4302, + "time_per_iteration": 2.9701120853424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788744, + "balance_loss_mlp": 1.3319999, + "diversity_loss_mlp": 0.22346261, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.027743371165779296, + "language_loss": 0.82558441, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83347189, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01101248, + "step": 4303, + "time_per_iteration": 3.223346471786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052756, + "balance_loss_mlp": 1.04391634, + "diversity_loss_mlp": 0.0, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.05962542188798652, + "language_loss": 0.7781111, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78863871, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4304, + "time_per_iteration": 2.7994863986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051008, + "balance_loss_mlp": 1.04218018, + "diversity_loss_mlp": 0.0, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.07052691004217361, + "language_loss": 0.84562683, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85613692, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4305, + "time_per_iteration": 3.0267395973205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.04208159, + "diversity_loss_mlp": 0.0, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.07942922848471844, + "language_loss": 0.78335333, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79386634, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4306, + "time_per_iteration": 2.914696455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049867, + "balance_loss_mlp": 1.0407536, + "diversity_loss_mlp": 0.0, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 0.08577922080448468, + "language_loss": 0.82953119, + "learning_rate": 7.509415355178806e-05, + "loss": 0.8400299, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4307, + "time_per_iteration": 2.9498178958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788913, + "balance_loss_mlp": 1.33115017, + "diversity_loss_mlp": 0.22477263, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.04309088247538252, + "language_loss": 0.77926069, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78714979, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01095133, + "step": 4308, + "time_per_iteration": 2.7063913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050662, + "balance_loss_mlp": 1.04154897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.05899046117627297, + "language_loss": 0.81765443, + "learning_rate": 7.476606412570352e-05, + "loss": 0.828161, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4309, + "time_per_iteration": 3.0521981716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053534, + "balance_loss_mlp": 1.04459929, + "diversity_loss_mlp": 0.0, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.07518852690871787, + "language_loss": 0.80517173, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81570709, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4310, + "time_per_iteration": 2.904289722442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055947, + "balance_loss_mlp": 1.04662442, + "diversity_loss_mlp": 0.0, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.06212685924060065, + "language_loss": 0.81412387, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82468331, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4311, + "time_per_iteration": 3.203298807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052126, + "balance_loss_mlp": 1.04322684, + "diversity_loss_mlp": 0.0, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.05391272281173969, + "language_loss": 0.81940407, + "learning_rate": 7.427516832380948e-05, + "loss": 0.8299253, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4312, + "time_per_iteration": 2.8845975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055692, + "balance_loss_mlp": 1.04694164, + "diversity_loss_mlp": 0.0, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.05500480744199572, + "language_loss": 0.77808565, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78864259, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4313, + "time_per_iteration": 2.7781200408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_mlp": 1.03975666, + "diversity_loss_mlp": 0.0, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.06268776190670762, + "language_loss": 0.77513206, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78561807, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4314, + "time_per_iteration": 3.6484732627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060785, + "balance_loss_mlp": 1.05197561, + "diversity_loss_mlp": 0.0, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.07094165320870974, + "language_loss": 0.83007073, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84067863, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4315, + "time_per_iteration": 2.7556705474853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105563, + "balance_loss_mlp": 1.04686821, + "diversity_loss_mlp": 0.0, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.06645426228125094, + "language_loss": 0.84888268, + "learning_rate": 7.362295481759412e-05, + "loss": 0.85943896, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4316, + "time_per_iteration": 2.6553759574890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786621, + "balance_loss_mlp": 1.32643843, + "diversity_loss_mlp": 0.22519124, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.03189628781024831, + "language_loss": 0.83680773, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84467387, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01080582, + "step": 4317, + "time_per_iteration": 2.742246150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_mlp": 1.04532266, + "diversity_loss_mlp": 0.0, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.06852217711760565, + "language_loss": 0.7890569, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79959965, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4318, + "time_per_iteration": 2.601116418838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105841, + "balance_loss_mlp": 1.04927838, + "diversity_loss_mlp": 0.0, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.06935977491556748, + "language_loss": 0.83060843, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84119254, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4319, + "time_per_iteration": 2.7160871028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.04382229, + "diversity_loss_mlp": 0.0, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.07045309902078044, + "language_loss": 0.78631043, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79683906, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4320, + "time_per_iteration": 3.009129762649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059595, + "balance_loss_mlp": 1.05061913, + "diversity_loss_mlp": 0.0, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 0.06816415290870351, + "language_loss": 0.81865102, + "learning_rate": 7.281141292683746e-05, + "loss": 0.829247, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4321, + "time_per_iteration": 2.814836025238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056811, + "balance_loss_mlp": 1.04793024, + "diversity_loss_mlp": 0.0, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.06950401316575304, + "language_loss": 0.7471621, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75773025, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4322, + "time_per_iteration": 3.438296318054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057458, + "balance_loss_mlp": 1.0484755, + "diversity_loss_mlp": 0.0, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07376809791811713, + "language_loss": 0.82077682, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83135134, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4323, + "time_per_iteration": 2.7750163078308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054403, + "balance_loss_mlp": 1.04560554, + "diversity_loss_mlp": 0.0, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.07472428991139068, + "language_loss": 0.77946472, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79000878, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4324, + "time_per_iteration": 3.035860776901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058014, + "balance_loss_mlp": 1.04919243, + "diversity_loss_mlp": 0.0, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.06856699827771942, + "language_loss": 0.83216256, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84274268, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4325, + "time_per_iteration": 2.705737352371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057993, + "balance_loss_mlp": 1.04915345, + "diversity_loss_mlp": 0.0, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.07351613065944015, + "language_loss": 0.82007957, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83065945, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4326, + "time_per_iteration": 2.601170539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060086, + "balance_loss_mlp": 1.0512104, + "diversity_loss_mlp": 0.0, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.05702554279595623, + "language_loss": 0.85418373, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86478466, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4327, + "time_per_iteration": 2.6739983558654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057295, + "balance_loss_mlp": 1.04846764, + "diversity_loss_mlp": 0.0, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.06350176662838333, + "language_loss": 0.82565081, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83622372, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4328, + "time_per_iteration": 2.608927011489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_mlp": 1.04681087, + "diversity_loss_mlp": 0.0, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.06140661393609168, + "language_loss": 0.81182075, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82237709, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4329, + "time_per_iteration": 2.9149293899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055768, + "balance_loss_mlp": 1.04682159, + "diversity_loss_mlp": 0.0, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.07439273272708623, + "language_loss": 0.8576234, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86818105, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4330, + "time_per_iteration": 2.9639134407043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105965, + "balance_loss_mlp": 1.05070877, + "diversity_loss_mlp": 0.0, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.08290776170171969, + "language_loss": 0.86890334, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87949985, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4331, + "time_per_iteration": 2.6148195266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054578, + "balance_loss_mlp": 1.04560781, + "diversity_loss_mlp": 0.0, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.057322207358884096, + "language_loss": 0.82625836, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83680409, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4332, + "time_per_iteration": 2.621798515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_mlp": 1.0477283, + "diversity_loss_mlp": 0.0, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.07570063772280167, + "language_loss": 0.82964915, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84021485, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4333, + "time_per_iteration": 2.837819814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053173, + "balance_loss_mlp": 1.04428554, + "diversity_loss_mlp": 0.0, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.06974463300031715, + "language_loss": 0.83023667, + "learning_rate": 7.072082791073419e-05, + "loss": 0.8407684, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4334, + "time_per_iteration": 3.1047897338867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054952, + "balance_loss_mlp": 1.04588628, + "diversity_loss_mlp": 0.0, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 0.07461604540726756, + "language_loss": 0.82598537, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83653492, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4335, + "time_per_iteration": 2.5917162895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.04616058, + "diversity_loss_mlp": 0.0, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.07051755558905955, + "language_loss": 0.8628878, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87344062, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4336, + "time_per_iteration": 2.6134135723114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_mlp": 1.04197288, + "diversity_loss_mlp": 0.0, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.06598640893887409, + "language_loss": 0.83991468, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85042214, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4337, + "time_per_iteration": 2.7903592586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_mlp": 1.04391873, + "diversity_loss_mlp": 0.0, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.0663044915688964, + "language_loss": 0.7816447, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79217046, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4338, + "time_per_iteration": 2.7299916744232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053332, + "balance_loss_mlp": 1.04413533, + "diversity_loss_mlp": 0.0, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.06355289445146371, + "language_loss": 0.76546603, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77599931, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4339, + "time_per_iteration": 2.8064498901367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052893, + "balance_loss_mlp": 1.04425037, + "diversity_loss_mlp": 0.0, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.061799613244502456, + "language_loss": 0.84427285, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85480177, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4340, + "time_per_iteration": 2.7731611728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105023, + "balance_loss_mlp": 1.04137301, + "diversity_loss_mlp": 0.0, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.15350718356465945, + "language_loss": 0.79499578, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80549812, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4341, + "time_per_iteration": 2.6016902923583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.04431295, + "diversity_loss_mlp": 0.0, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.07564737297123257, + "language_loss": 0.78984159, + "learning_rate": 6.944830483504328e-05, + "loss": 0.80037045, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 4342, + "time_per_iteration": 2.670459747314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_mlp": 1.04070663, + "diversity_loss_mlp": 0.0, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.06668235677339521, + "language_loss": 0.8060447, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81654119, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4343, + "time_per_iteration": 2.817136287689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783572, + "balance_loss_mlp": 1.31915021, + "diversity_loss_mlp": 0.22572948, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.032919488551848924, + "language_loss": 0.84127021, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84910595, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01113241, + "step": 4344, + "time_per_iteration": 3.2518675327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059144, + "balance_loss_mlp": 1.05024457, + "diversity_loss_mlp": 0.0, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.06270529003473267, + "language_loss": 0.85050792, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86109936, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4345, + "time_per_iteration": 3.1636109352111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053656, + "balance_loss_mlp": 1.04487062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.07260078506489727, + "language_loss": 0.82210159, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83263814, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4346, + "time_per_iteration": 2.8204703330993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050108, + "balance_loss_mlp": 1.04092288, + "diversity_loss_mlp": 0.0, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.061944149403073474, + "language_loss": 0.8502146, + "learning_rate": 6.865840844295796e-05, + "loss": 0.86071575, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4347, + "time_per_iteration": 2.805941343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053763, + "balance_loss_mlp": 1.04459023, + "diversity_loss_mlp": 0.0, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.0772733121075158, + "language_loss": 0.8092171, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81975472, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4348, + "time_per_iteration": 3.040851593017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790162, + "balance_loss_mlp": 1.33250082, + "diversity_loss_mlp": 0.22602889, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 0.039903517211963106, + "language_loss": 0.86440182, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87230343, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108971, + "step": 4349, + "time_per_iteration": 2.699540615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054884, + "balance_loss_mlp": 1.04582453, + "diversity_loss_mlp": 0.0, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.07332657660036589, + "language_loss": 0.87533635, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88588518, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4350, + "time_per_iteration": 2.7678165435791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052928, + "balance_loss_mlp": 1.04408848, + "diversity_loss_mlp": 0.0, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.06629049094152589, + "language_loss": 0.85621446, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86674374, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4351, + "time_per_iteration": 2.737682819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045769, + "balance_loss_mlp": 1.03676879, + "diversity_loss_mlp": 0.0, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.07766225400345093, + "language_loss": 0.82484055, + "learning_rate": 6.787269858905603e-05, + "loss": 0.8352983, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4352, + "time_per_iteration": 2.9142751693725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048952, + "balance_loss_mlp": 1.04007125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.06438247248872511, + "language_loss": 0.85065448, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86114407, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4353, + "time_per_iteration": 2.6874396800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_mlp": 1.0389719, + "diversity_loss_mlp": 0.0, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.07663124345564373, + "language_loss": 0.82635599, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83683646, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4354, + "time_per_iteration": 2.998286724090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052737, + "balance_loss_mlp": 1.04317021, + "diversity_loss_mlp": 0.0, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.07233016182516484, + "language_loss": 0.80633909, + "learning_rate": 6.74032853891452e-05, + "loss": 0.8168664, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 4355, + "time_per_iteration": 2.75176739692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_mlp": 1.03711188, + "diversity_loss_mlp": 0.0, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.06437396666642163, + "language_loss": 0.82113147, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83159232, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4356, + "time_per_iteration": 2.638768196105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.04145241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.06364273403340714, + "language_loss": 0.8922165, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90272063, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4357, + "time_per_iteration": 2.78487491607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051515, + "balance_loss_mlp": 1.04247308, + "diversity_loss_mlp": 0.0, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.08356541609520973, + "language_loss": 0.82212794, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83264303, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4358, + "time_per_iteration": 2.9017884731292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786956, + "balance_loss_mlp": 1.32808125, + "diversity_loss_mlp": 0.22438851, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.03328062669176706, + "language_loss": 0.86377019, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87163973, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072117, + "step": 4359, + "time_per_iteration": 2.57958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_mlp": 1.0387392, + "diversity_loss_mlp": 0.0, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.07170710125962251, + "language_loss": 0.87394094, + "learning_rate": 6.662428984145336e-05, + "loss": 0.8844198, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4360, + "time_per_iteration": 2.5944197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016166, + "balance_loss_mlp": 1.01177895, + "diversity_loss_mlp": 0.0, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.01396369957588317, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72796351, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.04394531, + "routerloss_mlp": 0.0, + "step": 4361, + "time_per_iteration": 5.049343109130859 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_mlp": 1.04028869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.0657328713955244, + "language_loss": 0.82911998, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83961105, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4362, + "time_per_iteration": 2.857707977294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.04045308, + "diversity_loss_mlp": 0.0, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 0.07766308356740377, + "language_loss": 0.80444038, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81493711, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4363, + "time_per_iteration": 2.481901168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.0369525, + "diversity_loss_mlp": 0.0, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.061496061316517255, + "language_loss": 0.82737863, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83784378, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 4364, + "time_per_iteration": 2.7074997425079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.03946805, + "diversity_loss_mlp": 0.0, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.067014192244174, + "language_loss": 0.84650993, + "learning_rate": 6.58495005748016e-05, + "loss": 0.85699666, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4365, + "time_per_iteration": 3.1557445526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045295, + "balance_loss_mlp": 1.03640795, + "diversity_loss_mlp": 0.0, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.0631575802857794, + "language_loss": 0.89196813, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90242112, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4366, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_mlp": 1.03753984, + "diversity_loss_mlp": 0.0, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.06347741472269025, + "language_loss": 0.83584821, + "learning_rate": 6.554076431268341e-05, + "loss": 0.8463158, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4367, + "time_per_iteration": 2.6431565284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.04021287, + "diversity_loss_mlp": 0.0, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07076442779164972, + "language_loss": 0.80955088, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82004237, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4368, + "time_per_iteration": 3.018554449081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00773368, + "balance_loss_mlp": 1.30118096, + "diversity_loss_mlp": 0.22479768, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.03439452063807504, + "language_loss": 0.77776653, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78550017, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01037853, + "step": 4369, + "time_per_iteration": 2.6944448947906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_mlp": 1.03648067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.1193689802326749, + "language_loss": 0.87956655, + "learning_rate": 6.507892510918079e-05, + "loss": 0.8900246, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4370, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047708, + "balance_loss_mlp": 1.03855264, + "diversity_loss_mlp": 0.0, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07411757925982031, + "language_loss": 0.81849647, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82897353, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4371, + "time_per_iteration": 2.776374578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050613, + "balance_loss_mlp": 1.04111791, + "diversity_loss_mlp": 0.0, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.08018635739985482, + "language_loss": 0.77876925, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78927541, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 4372, + "time_per_iteration": 2.7516069412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008359, + "balance_loss_mlp": 1.00390017, + "diversity_loss_mlp": 0.0, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.00952058425700796, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78687477, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4373, + "time_per_iteration": 4.912792682647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_mlp": 1.0395565, + "diversity_loss_mlp": 0.0, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.07245552666854996, + "language_loss": 0.78958535, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80007321, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4374, + "time_per_iteration": 2.711447238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_mlp": 1.03688145, + "diversity_loss_mlp": 0.0, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.07770698856431457, + "language_loss": 0.77577722, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78623879, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4375, + "time_per_iteration": 2.694774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050017, + "balance_loss_mlp": 1.04059398, + "diversity_loss_mlp": 0.0, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.11734230107546348, + "language_loss": 0.80035317, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81085336, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 4376, + "time_per_iteration": 2.918545961380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049017, + "balance_loss_mlp": 1.03976655, + "diversity_loss_mlp": 0.0, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 0.07794527811003633, + "language_loss": 0.72769749, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73818767, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4377, + "time_per_iteration": 3.4151737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050973, + "balance_loss_mlp": 1.04171598, + "diversity_loss_mlp": 0.0, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.0675536673804059, + "language_loss": 0.82617545, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83668518, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4378, + "time_per_iteration": 2.749711036682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048915, + "balance_loss_mlp": 1.03964031, + "diversity_loss_mlp": 0.0, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.06567054296588401, + "language_loss": 0.82044506, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83093417, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4379, + "time_per_iteration": 2.761420488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049874, + "balance_loss_mlp": 1.04072499, + "diversity_loss_mlp": 0.0, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.06119198131713492, + "language_loss": 0.86507058, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87556934, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4380, + "time_per_iteration": 2.816401720046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049119, + "balance_loss_mlp": 1.04015481, + "diversity_loss_mlp": 0.0, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.08611471083111012, + "language_loss": 0.77840042, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78889161, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4381, + "time_per_iteration": 2.8088033199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048589, + "balance_loss_mlp": 1.03958273, + "diversity_loss_mlp": 0.0, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.06180211012921075, + "language_loss": 0.79696667, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80745256, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4382, + "time_per_iteration": 3.0762522220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049388, + "balance_loss_mlp": 1.04026818, + "diversity_loss_mlp": 0.0, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.07097197774761282, + "language_loss": 0.80925977, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81975365, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4383, + "time_per_iteration": 2.8958194255828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.03793836, + "diversity_loss_mlp": 0.0, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.08175099554660337, + "language_loss": 0.84386265, + "learning_rate": 6.294379473198208e-05, + "loss": 0.854334, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4384, + "time_per_iteration": 2.6954331398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049059, + "balance_loss_mlp": 1.03982067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.0940310335311775, + "language_loss": 0.85289472, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86338532, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4385, + "time_per_iteration": 2.6073288917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052192, + "balance_loss_mlp": 1.0430907, + "diversity_loss_mlp": 0.0, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.06584361059499325, + "language_loss": 0.80478346, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81530541, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4386, + "time_per_iteration": 2.9602465629577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003223, + "balance_loss_mlp": 0.99876487, + "diversity_loss_mlp": 0.0, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.01332354164942413, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76839739, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4387, + "time_per_iteration": 4.922089099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051887, + "balance_loss_mlp": 1.0426724, + "diversity_loss_mlp": 0.0, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.08625525862164317, + "language_loss": 0.82786238, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83838129, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4388, + "time_per_iteration": 2.8626224994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_mlp": 1.04066157, + "diversity_loss_mlp": 0.0, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.06592449787759593, + "language_loss": 0.79633564, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80683196, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4389, + "time_per_iteration": 2.681319236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_mlp": 1.04455543, + "diversity_loss_mlp": 0.0, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.06375633990495472, + "language_loss": 0.80234212, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81287819, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4390, + "time_per_iteration": 2.8914427757263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_mlp": 1.0421586, + "diversity_loss_mlp": 0.0, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 0.07030854249904422, + "language_loss": 0.74476206, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75527334, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4391, + "time_per_iteration": 2.983142375946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056789, + "balance_loss_mlp": 1.04770541, + "diversity_loss_mlp": 0.0, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.06360843007002392, + "language_loss": 0.80354917, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81411707, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4392, + "time_per_iteration": 3.266145706176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105708, + "balance_loss_mlp": 1.04769254, + "diversity_loss_mlp": 0.0, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.0822485878003235, + "language_loss": 0.72267312, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73324394, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 4393, + "time_per_iteration": 2.8685081005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.03982329, + "diversity_loss_mlp": 0.0, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.07697573681675166, + "language_loss": 0.83679235, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84727973, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4394, + "time_per_iteration": 2.533674478530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053449, + "balance_loss_mlp": 1.04453218, + "diversity_loss_mlp": 0.0, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.07537571823528784, + "language_loss": 0.7097168, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72025126, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4395, + "time_per_iteration": 2.6876683235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.04228103, + "diversity_loss_mlp": 0.0, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.08282627197829308, + "language_loss": 0.84426546, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85477906, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4396, + "time_per_iteration": 2.6650242805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.0413115, + "diversity_loss_mlp": 0.0, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.15468816830135243, + "language_loss": 0.79700321, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80750489, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4397, + "time_per_iteration": 2.7101781368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044589, + "balance_loss_mlp": 1.03563631, + "diversity_loss_mlp": 0.0, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.05893126536703995, + "language_loss": 0.75071192, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76115775, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4398, + "time_per_iteration": 2.9596059322357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_mlp": 1.03793144, + "diversity_loss_mlp": 0.0, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.0659677770319019, + "language_loss": 0.80090201, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81137055, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4399, + "time_per_iteration": 2.750497341156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008801, + "balance_loss_mlp": 1.0044378, + "diversity_loss_mlp": 0.0, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.013995388355349315, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82717371, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.04370117, + "routerloss_mlp": 0.0, + "step": 4400, + "time_per_iteration": 4.847966432571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_mlp": 1.03586566, + "diversity_loss_mlp": 0.0, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.060817981350280916, + "language_loss": 0.79790008, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80835003, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4401, + "time_per_iteration": 2.668105125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.03601933, + "diversity_loss_mlp": 0.0, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.07199778737497019, + "language_loss": 0.84602404, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85647476, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4402, + "time_per_iteration": 2.8081796169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_mlp": 1.03515351, + "diversity_loss_mlp": 0.0, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.10925067279097164, + "language_loss": 0.87193465, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88237822, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4403, + "time_per_iteration": 2.7070863246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047531, + "balance_loss_mlp": 1.03842974, + "diversity_loss_mlp": 0.0, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.08568709869316025, + "language_loss": 0.84353817, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85401344, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4404, + "time_per_iteration": 2.5401875972747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_mlp": 1.03592503, + "diversity_loss_mlp": 0.0, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 0.0709686000036253, + "language_loss": 0.79758859, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80803621, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4405, + "time_per_iteration": 2.9586398601531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_mlp": 1.03862858, + "diversity_loss_mlp": 0.0, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.0758173793957083, + "language_loss": 0.80622578, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81670201, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4406, + "time_per_iteration": 2.5468907356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104799, + "balance_loss_mlp": 1.03891182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.08716014432574012, + "language_loss": 0.83022702, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84070694, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4407, + "time_per_iteration": 3.180832862854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03713799, + "diversity_loss_mlp": 0.0, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.06912642288251827, + "language_loss": 0.80724686, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81770915, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4408, + "time_per_iteration": 2.8790152072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_mlp": 1.03665996, + "diversity_loss_mlp": 0.0, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.06430935054215667, + "language_loss": 0.82201052, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83246624, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4409, + "time_per_iteration": 2.8187878131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048254, + "balance_loss_mlp": 1.03908658, + "diversity_loss_mlp": 0.0, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.07260617685747814, + "language_loss": 0.82220393, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83268642, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4410, + "time_per_iteration": 2.618715286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011478, + "balance_loss_mlp": 1.00716281, + "diversity_loss_mlp": 0.0, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.010800011769390029, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77308393, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.04321289, + "routerloss_mlp": 0.0, + "step": 4411, + "time_per_iteration": 4.929163455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779672, + "balance_loss_mlp": 1.31286287, + "diversity_loss_mlp": 0.22471759, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.03344280518316992, + "language_loss": 0.74134266, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74913931, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01088216, + "step": 4412, + "time_per_iteration": 2.887648582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.03763819, + "diversity_loss_mlp": 0.0, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.07759361608874747, + "language_loss": 0.79911488, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80958003, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4413, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051803, + "balance_loss_mlp": 1.04298139, + "diversity_loss_mlp": 0.0, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06257116408066361, + "language_loss": 0.77061796, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78113604, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4414, + "time_per_iteration": 2.739079236984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_mlp": 1.03654408, + "diversity_loss_mlp": 0.0, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.08252582476840821, + "language_loss": 0.779769, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79022747, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4415, + "time_per_iteration": 2.550938367843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790044, + "balance_loss_mlp": 1.33385825, + "diversity_loss_mlp": 0.22484043, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.03510190626754167, + "language_loss": 0.82241035, + "learning_rate": 5.818863771788013e-05, + "loss": 0.83031082, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01069522, + "step": 4416, + "time_per_iteration": 2.629504442214966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052312, + "balance_loss_mlp": 1.04326987, + "diversity_loss_mlp": 0.0, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.06455923563838298, + "language_loss": 0.81343329, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82395649, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4417, + "time_per_iteration": 3.1615569591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00774549, + "balance_loss_mlp": 1.30053818, + "diversity_loss_mlp": 0.22707056, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.03325715859037055, + "language_loss": 0.78278667, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79053217, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01074457, + "step": 4418, + "time_per_iteration": 3.063164234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105333, + "balance_loss_mlp": 1.04439521, + "diversity_loss_mlp": 0.0, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 0.06460470640159872, + "language_loss": 0.84812874, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85866207, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4419, + "time_per_iteration": 2.694917678833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_mlp": 1.043239, + "diversity_loss_mlp": 0.0, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.11539940060888441, + "language_loss": 0.83957243, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85009253, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4420, + "time_per_iteration": 2.8541665077209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053783, + "balance_loss_mlp": 1.04484272, + "diversity_loss_mlp": 0.0, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.06628978561515504, + "language_loss": 0.79903436, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80957222, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4421, + "time_per_iteration": 3.027402877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057337, + "balance_loss_mlp": 1.04840255, + "diversity_loss_mlp": 0.0, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.065145609650453, + "language_loss": 0.86839747, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87897086, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4422, + "time_per_iteration": 2.608247756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060117, + "balance_loss_mlp": 1.05109882, + "diversity_loss_mlp": 0.0, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06673581896538218, + "language_loss": 0.84873575, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85933691, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4423, + "time_per_iteration": 2.640604019165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055598, + "balance_loss_mlp": 1.04669881, + "diversity_loss_mlp": 0.0, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.07136007632395135, + "language_loss": 0.84349924, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85405523, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4424, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056087, + "balance_loss_mlp": 1.04699087, + "diversity_loss_mlp": 0.0, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.09439640399937352, + "language_loss": 0.77805614, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78861696, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4425, + "time_per_iteration": 2.7769734859466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105464, + "balance_loss_mlp": 1.04534197, + "diversity_loss_mlp": 0.0, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.06728138208507028, + "language_loss": 0.78342903, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79397547, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4426, + "time_per_iteration": 2.878251075744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_mlp": 1.04355907, + "diversity_loss_mlp": 0.0, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.08229476351335695, + "language_loss": 0.78530198, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.7958256, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4427, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105621, + "balance_loss_mlp": 1.04718578, + "diversity_loss_mlp": 0.0, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.06960729962592987, + "language_loss": 0.79901236, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80957448, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4428, + "time_per_iteration": 2.7762057781219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055828, + "balance_loss_mlp": 1.04680383, + "diversity_loss_mlp": 0.0, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.07302244449525275, + "language_loss": 0.79662502, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80718338, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4429, + "time_per_iteration": 2.660036325454712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056924, + "balance_loss_mlp": 1.04789412, + "diversity_loss_mlp": 0.0, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.07546735542766958, + "language_loss": 0.78632665, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79689586, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4430, + "time_per_iteration": 2.6127545833587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_mlp": 1.04328537, + "diversity_loss_mlp": 0.0, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.06572867134879866, + "language_loss": 0.80567098, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81619287, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4431, + "time_per_iteration": 2.5721845626831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053312, + "balance_loss_mlp": 1.04428816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.07674502364366044, + "language_loss": 0.79846716, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80900025, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4432, + "time_per_iteration": 2.510967493057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057356, + "balance_loss_mlp": 1.04839182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 0.0698638093203012, + "language_loss": 0.80873108, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.8193047, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4433, + "time_per_iteration": 2.6090145111083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.04608333, + "diversity_loss_mlp": 0.0, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.06627585566585331, + "language_loss": 0.82683206, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83738005, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4434, + "time_per_iteration": 2.5510103702545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055005, + "balance_loss_mlp": 1.04617763, + "diversity_loss_mlp": 0.0, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.06848630308035882, + "language_loss": 0.83932847, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84987855, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4435, + "time_per_iteration": 2.9127962589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052958, + "balance_loss_mlp": 1.0440768, + "diversity_loss_mlp": 0.0, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.07760386997403859, + "language_loss": 0.83284372, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.8433733, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4436, + "time_per_iteration": 2.7183430194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056242, + "balance_loss_mlp": 1.04731894, + "diversity_loss_mlp": 0.0, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.08897067825861743, + "language_loss": 0.79124266, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80180502, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4437, + "time_per_iteration": 2.6436634063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051335, + "balance_loss_mlp": 1.04229927, + "diversity_loss_mlp": 0.0, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.07034775984994458, + "language_loss": 0.82836092, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83887428, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4438, + "time_per_iteration": 2.71964430809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105491, + "balance_loss_mlp": 1.04575455, + "diversity_loss_mlp": 0.0, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.0746305826676403, + "language_loss": 0.83321023, + "learning_rate": 5.48792487359433e-05, + "loss": 0.8437593, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4439, + "time_per_iteration": 2.7270102500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.04193783, + "diversity_loss_mlp": 0.0, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.11714515413286376, + "language_loss": 0.81816977, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82868266, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 4440, + "time_per_iteration": 2.665386915206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047429, + "balance_loss_mlp": 1.03834486, + "diversity_loss_mlp": 0.0, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.06595291509459175, + "language_loss": 0.77334499, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78381932, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4441, + "time_per_iteration": 2.7599966526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_mlp": 1.04063272, + "diversity_loss_mlp": 0.0, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.07060933653649062, + "language_loss": 0.82500267, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83549809, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4442, + "time_per_iteration": 2.639495372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051049, + "balance_loss_mlp": 1.04200077, + "diversity_loss_mlp": 0.0, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.07063393477475531, + "language_loss": 0.81464767, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82515812, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4443, + "time_per_iteration": 2.4978034496307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.03971708, + "diversity_loss_mlp": 0.0, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.08111118700719577, + "language_loss": 0.77217865, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78266835, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4444, + "time_per_iteration": 2.4748144149780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_mlp": 1.04197693, + "diversity_loss_mlp": 0.0, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.09368313437946132, + "language_loss": 0.79476607, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80527484, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4445, + "time_per_iteration": 2.796154499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_mlp": 1.04050708, + "diversity_loss_mlp": 0.0, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.06371937907069437, + "language_loss": 0.78714025, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.79763651, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4446, + "time_per_iteration": 2.5761666297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051208, + "balance_loss_mlp": 1.04208875, + "diversity_loss_mlp": 0.0, + "epoch": 0.8555213543670642, + "flos": 557009063424.0, + "grad_norm": 0.06774235964888489, + "language_loss": 0.76389277, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77440482, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4447, + "time_per_iteration": 2.772761344909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050103, + "balance_loss_mlp": 1.04116249, + "diversity_loss_mlp": 0.0, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.06327552262806617, + "language_loss": 0.75251746, + "learning_rate": 5.360911790663775e-05, + "loss": 0.76301849, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4448, + "time_per_iteration": 2.6334402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_mlp": 1.03859377, + "diversity_loss_mlp": 0.0, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.057928896872347986, + "language_loss": 0.78575248, + "learning_rate": 5.346885805197238e-05, + "loss": 0.7962302, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4449, + "time_per_iteration": 2.965585947036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105067, + "balance_loss_mlp": 1.0418725, + "diversity_loss_mlp": 0.0, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.07751296058129717, + "language_loss": 0.83346003, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84396672, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4450, + "time_per_iteration": 2.6572906970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051985, + "balance_loss_mlp": 1.04291868, + "diversity_loss_mlp": 0.0, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.06226038691697754, + "language_loss": 0.83402085, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84454072, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4451, + "time_per_iteration": 2.715268611907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050652, + "balance_loss_mlp": 1.04167557, + "diversity_loss_mlp": 0.0, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.07567123638772062, + "language_loss": 0.80818313, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.8186897, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4452, + "time_per_iteration": 3.072892665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_mlp": 1.03925145, + "diversity_loss_mlp": 0.0, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.0664830695636331, + "language_loss": 0.84927678, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85975915, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4453, + "time_per_iteration": 2.538435697555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048709, + "balance_loss_mlp": 1.03954768, + "diversity_loss_mlp": 0.0, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.08569801456429596, + "language_loss": 0.84562624, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85611331, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4454, + "time_per_iteration": 2.510293960571289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045918, + "balance_loss_mlp": 1.03693008, + "diversity_loss_mlp": 0.0, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.07456272936898871, + "language_loss": 0.82575965, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83621883, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4455, + "time_per_iteration": 2.5304741859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783782, + "balance_loss_mlp": 1.32045674, + "diversity_loss_mlp": 0.22576013, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.031240053389996185, + "language_loss": 0.85362232, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86146021, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067326, + "step": 4456, + "time_per_iteration": 2.6050779819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047609, + "balance_loss_mlp": 1.03857875, + "diversity_loss_mlp": 0.0, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.05524865057671199, + "language_loss": 0.83069348, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84116954, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4457, + "time_per_iteration": 3.0707337856292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_mlp": 1.03807688, + "diversity_loss_mlp": 0.0, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.061549314191434064, + "language_loss": 0.75128138, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76175112, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4458, + "time_per_iteration": 2.8048369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009207, + "balance_loss_mlp": 1.00486779, + "diversity_loss_mlp": 0.0, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.009410723197847748, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85776496, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.04345703, + "routerloss_mlp": 0.0, + "step": 4459, + "time_per_iteration": 5.052462339401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049117, + "balance_loss_mlp": 1.04002094, + "diversity_loss_mlp": 0.0, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.05814228288805263, + "language_loss": 0.89274621, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90323746, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4460, + "time_per_iteration": 2.707102060317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_mlp": 1.03917027, + "diversity_loss_mlp": 0.0, + "epoch": 0.8582146979607541, + "flos": 706231954944.0, + "grad_norm": 0.07378533003990426, + "language_loss": 0.7931006, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80358326, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4461, + "time_per_iteration": 2.865081310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104748, + "balance_loss_mlp": 1.03854513, + "diversity_loss_mlp": 0.0, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.06549370953575787, + "language_loss": 0.823946, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.8344208, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4462, + "time_per_iteration": 2.960702419281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051966, + "balance_loss_mlp": 1.04283428, + "diversity_loss_mlp": 0.0, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.07292053022403922, + "language_loss": 0.85890585, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86942554, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4463, + "time_per_iteration": 2.795929193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047374, + "balance_loss_mlp": 1.03847504, + "diversity_loss_mlp": 0.0, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.0593280148984403, + "language_loss": 0.78598225, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79645598, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4464, + "time_per_iteration": 2.81134033203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046787, + "balance_loss_mlp": 1.03755462, + "diversity_loss_mlp": 0.0, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.08434589868858423, + "language_loss": 0.80900252, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81947035, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4465, + "time_per_iteration": 2.698519229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055861, + "balance_loss_mlp": 1.04674125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.08280689414498507, + "language_loss": 0.78631306, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79687166, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4466, + "time_per_iteration": 2.7119884490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051995, + "balance_loss_mlp": 1.04303014, + "diversity_loss_mlp": 0.0, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.0696773734857941, + "language_loss": 0.80894464, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81946456, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4467, + "time_per_iteration": 2.647484064102173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053716, + "balance_loss_mlp": 1.04451299, + "diversity_loss_mlp": 0.0, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.07756425408438049, + "language_loss": 0.83735067, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84788781, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4468, + "time_per_iteration": 2.606961488723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050814, + "balance_loss_mlp": 1.04189694, + "diversity_loss_mlp": 0.0, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.09275491108708087, + "language_loss": 0.76113212, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77164024, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4469, + "time_per_iteration": 2.7252352237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044872, + "balance_loss_mlp": 1.03569305, + "diversity_loss_mlp": 0.0, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.0715850887288851, + "language_loss": 0.84056306, + "learning_rate": 5.056353024046462e-05, + "loss": 0.85101181, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4470, + "time_per_iteration": 2.705986261367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105045, + "balance_loss_mlp": 1.04136574, + "diversity_loss_mlp": 0.0, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.06285887675624062, + "language_loss": 0.83157659, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84208107, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4471, + "time_per_iteration": 2.666837215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049444, + "balance_loss_mlp": 1.04027641, + "diversity_loss_mlp": 0.0, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.05893825733891097, + "language_loss": 0.81146169, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.8219561, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4472, + "time_per_iteration": 2.8742566108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_mlp": 1.03975582, + "diversity_loss_mlp": 0.0, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.0784559569656679, + "language_loss": 0.75468278, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76517183, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4473, + "time_per_iteration": 2.7587168216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049665, + "balance_loss_mlp": 1.04089117, + "diversity_loss_mlp": 0.0, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.06949986804746215, + "language_loss": 0.77037829, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78087491, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4474, + "time_per_iteration": 2.6033754348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.0372982, + "diversity_loss_mlp": 0.0, + "epoch": 0.860908041554444, + "flos": 488394344448.0, + "grad_norm": 0.06715849698858382, + "language_loss": 0.82796544, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83842647, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4475, + "time_per_iteration": 2.6462340354919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_mlp": 1.03617787, + "diversity_loss_mlp": 0.0, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.15717168716327404, + "language_loss": 0.80459589, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81504726, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4476, + "time_per_iteration": 2.6762094497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049439, + "balance_loss_mlp": 1.03996754, + "diversity_loss_mlp": 0.0, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.06321855833863838, + "language_loss": 0.86383665, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.874331, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 4477, + "time_per_iteration": 3.0531985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104971, + "balance_loss_mlp": 1.04053116, + "diversity_loss_mlp": 0.0, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.06893935293866559, + "language_loss": 0.82464266, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83513981, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4478, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.04062486, + "diversity_loss_mlp": 0.0, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.0676917705812813, + "language_loss": 0.7915647, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80205905, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4479, + "time_per_iteration": 2.6640400886535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_mlp": 1.04052103, + "diversity_loss_mlp": 0.0, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.06998246415259375, + "language_loss": 0.81843102, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82892942, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 4480, + "time_per_iteration": 2.6465768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_mlp": 1.0348897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.09745126200827099, + "language_loss": 0.74436772, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.7548117, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 4481, + "time_per_iteration": 2.7863497734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048433, + "balance_loss_mlp": 1.03935492, + "diversity_loss_mlp": 0.0, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.06946555375175803, + "language_loss": 0.85534787, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86583215, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4482, + "time_per_iteration": 2.9774255752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051438, + "balance_loss_mlp": 1.04190743, + "diversity_loss_mlp": 0.0, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.07498520167107697, + "language_loss": 0.77633011, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78684449, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 4483, + "time_per_iteration": 3.2647812366485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783832, + "balance_loss_mlp": 1.32083893, + "diversity_loss_mlp": 0.22531055, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.03436935240738205, + "language_loss": 0.83586842, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84370679, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01075701, + "step": 4484, + "time_per_iteration": 2.928110122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048003, + "balance_loss_mlp": 1.03885961, + "diversity_loss_mlp": 0.0, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.0696769189264069, + "language_loss": 0.82539618, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83587623, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4485, + "time_per_iteration": 2.8806722164154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048081, + "balance_loss_mlp": 1.0388062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.07034386086507984, + "language_loss": 0.77557874, + "learning_rate": 4.840156846389487e-05, + "loss": 0.7860595, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 4486, + "time_per_iteration": 2.5718491077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045399, + "balance_loss_mlp": 1.03601718, + "diversity_loss_mlp": 0.0, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.08075284630280707, + "language_loss": 0.77191448, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78236842, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 4487, + "time_per_iteration": 3.206104040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_mlp": 1.03938758, + "diversity_loss_mlp": 0.0, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07054996301110567, + "language_loss": 0.78534716, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79583335, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4488, + "time_per_iteration": 2.933553695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049286, + "balance_loss_mlp": 1.03989816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.07600335888626973, + "language_loss": 0.83061361, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84110641, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 4489, + "time_per_iteration": 2.7383370399475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_mlp": 1.03779912, + "diversity_loss_mlp": 0.0, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.08034973175032056, + "language_loss": 0.80326092, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81372881, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4490, + "time_per_iteration": 2.734177827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_mlp": 1.03565812, + "diversity_loss_mlp": 0.0, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.057692915875148014, + "language_loss": 0.76451778, + "learning_rate": 4.773514997362e-05, + "loss": 0.77496397, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4491, + "time_per_iteration": 3.0788826942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049145, + "balance_loss_mlp": 1.04005527, + "diversity_loss_mlp": 0.0, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.07466724897853576, + "language_loss": 0.77982771, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.79031909, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4492, + "time_per_iteration": 2.530029058456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048683, + "balance_loss_mlp": 1.039379, + "diversity_loss_mlp": 0.0, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.07260420646457022, + "language_loss": 0.80692542, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81741226, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4493, + "time_per_iteration": 2.577784538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00778204, + "balance_loss_mlp": 1.31030798, + "diversity_loss_mlp": 0.22490472, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.03497904945521898, + "language_loss": 0.82458371, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83236575, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01059737, + "step": 4494, + "time_per_iteration": 2.807935953140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_mlp": 1.03800845, + "diversity_loss_mlp": 0.0, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.07146424710596733, + "language_loss": 0.84123516, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.8517077, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4495, + "time_per_iteration": 2.5809860229492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_mlp": 1.03464222, + "diversity_loss_mlp": 0.0, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.07059483757370776, + "language_loss": 0.81995988, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83039922, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4496, + "time_per_iteration": 3.083287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_mlp": 1.03781724, + "diversity_loss_mlp": 0.0, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.06772870422342313, + "language_loss": 0.76696306, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77743357, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4497, + "time_per_iteration": 3.074000835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045348, + "balance_loss_mlp": 1.03595984, + "diversity_loss_mlp": 0.0, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.07122639959522058, + "language_loss": 0.82500464, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83545816, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 4498, + "time_per_iteration": 2.7418711185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011015, + "balance_loss_mlp": 1.00648534, + "diversity_loss_mlp": 0.0, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.011864937591166903, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80185461, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4499, + "time_per_iteration": 4.7632763385772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_mlp": 1.03568506, + "diversity_loss_mlp": 0.0, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.060500475018932964, + "language_loss": 0.82638729, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83683342, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4500, + "time_per_iteration": 2.673696756362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043007, + "balance_loss_mlp": 1.0338217, + "diversity_loss_mlp": 0.0, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.07115245817272867, + "language_loss": 0.80032218, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81075215, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4501, + "time_per_iteration": 2.7261881828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_mlp": 1.03443861, + "diversity_loss_mlp": 0.0, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.05583001645863395, + "language_loss": 0.88010484, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89054, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4502, + "time_per_iteration": 2.8443400859832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_mlp": 1.03399086, + "diversity_loss_mlp": 0.0, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06991854339818697, + "language_loss": 0.79483074, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80526078, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4503, + "time_per_iteration": 2.7233920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_mlp": 1.0366962, + "diversity_loss_mlp": 0.0, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.06089898281543335, + "language_loss": 0.82218802, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83264363, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4504, + "time_per_iteration": 2.7425873279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050274, + "balance_loss_mlp": 1.04102361, + "diversity_loss_mlp": 0.0, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.06301593457003249, + "language_loss": 0.78539002, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79589272, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4505, + "time_per_iteration": 2.779776096343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_mlp": 1.03568339, + "diversity_loss_mlp": 0.0, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.08053258741139525, + "language_loss": 0.81561208, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82605934, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4506, + "time_per_iteration": 2.9221668243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.03598642, + "diversity_loss_mlp": 0.0, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.0716656508067539, + "language_loss": 0.84894359, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85939521, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4507, + "time_per_iteration": 2.72947359085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045976, + "balance_loss_mlp": 1.03671956, + "diversity_loss_mlp": 0.0, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.06708434542706315, + "language_loss": 0.76185739, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77231717, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 4508, + "time_per_iteration": 3.0237209796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_mlp": 1.03565264, + "diversity_loss_mlp": 0.0, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.06518598780385719, + "language_loss": 0.83932543, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84977174, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4509, + "time_per_iteration": 2.755521059036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047969, + "balance_loss_mlp": 1.03887904, + "diversity_loss_mlp": 0.0, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.0684158926680597, + "language_loss": 0.86113983, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87161952, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4510, + "time_per_iteration": 3.0163121223449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046767, + "balance_loss_mlp": 1.03768277, + "diversity_loss_mlp": 0.0, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.06806250868382878, + "language_loss": 0.80556583, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81603348, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4511, + "time_per_iteration": 2.7898309230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045577, + "balance_loss_mlp": 1.03637373, + "diversity_loss_mlp": 0.0, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.09053329692660277, + "language_loss": 0.79419863, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80465442, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4512, + "time_per_iteration": 2.579146385192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_mlp": 1.03741062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.06296584652642616, + "language_loss": 0.80771571, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.81817764, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4513, + "time_per_iteration": 2.6543962955474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_mlp": 1.03607059, + "diversity_loss_mlp": 0.0, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.07075999679510799, + "language_loss": 0.81035638, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82080734, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4514, + "time_per_iteration": 2.7115249633789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.03727281, + "diversity_loss_mlp": 0.0, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.08852072985797838, + "language_loss": 0.84644556, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85690933, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4515, + "time_per_iteration": 3.3953351974487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_mlp": 1.03951859, + "diversity_loss_mlp": 0.0, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.09550241245969901, + "language_loss": 0.83630067, + "learning_rate": 4.446902963685862e-05, + "loss": 0.8467859, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4516, + "time_per_iteration": 2.7019460201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046576, + "balance_loss_mlp": 1.03759933, + "diversity_loss_mlp": 0.0, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.061078878472804264, + "language_loss": 0.84983051, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.86029625, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4517, + "time_per_iteration": 2.6748125553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_mlp": 1.04121304, + "diversity_loss_mlp": 0.0, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.06941157706477712, + "language_loss": 0.86215872, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87265968, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4518, + "time_per_iteration": 2.580519914627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.0403676, + "diversity_loss_mlp": 0.0, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.060481411793616664, + "language_loss": 0.79905188, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80954409, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4519, + "time_per_iteration": 2.739733934402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_mlp": 1.04021573, + "diversity_loss_mlp": 0.0, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.08287606201497805, + "language_loss": 0.79479718, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80528903, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4520, + "time_per_iteration": 2.8706867694854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050204, + "balance_loss_mlp": 1.04147816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.06861911538387308, + "language_loss": 0.7854861, + "learning_rate": 4.38290443731934e-05, + "loss": 0.7959882, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4521, + "time_per_iteration": 2.5845677852630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051004, + "balance_loss_mlp": 1.0421524, + "diversity_loss_mlp": 0.0, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.0587255823279189, + "language_loss": 0.82027864, + "learning_rate": 4.370157842584671e-05, + "loss": 0.83078861, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4522, + "time_per_iteration": 2.7062559127807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047666, + "balance_loss_mlp": 1.03883255, + "diversity_loss_mlp": 0.0, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.07380194299564537, + "language_loss": 0.80566227, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81613898, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4523, + "time_per_iteration": 3.1326324939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050998, + "balance_loss_mlp": 1.04187274, + "diversity_loss_mlp": 0.0, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.06623832108710956, + "language_loss": 0.88391662, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89442658, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4524, + "time_per_iteration": 2.684760808944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_mlp": 1.03950179, + "diversity_loss_mlp": 0.0, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.06258298642895538, + "language_loss": 0.84498411, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.8554697, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4525, + "time_per_iteration": 2.8076937198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_mlp": 1.04153669, + "diversity_loss_mlp": 0.0, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + "grad_norm": 0.058503085061922935, + "language_loss": 0.85245442, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86295837, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4526, + "time_per_iteration": 2.938445806503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.03660226, + "diversity_loss_mlp": 0.0, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.06425490678836035, + "language_loss": 0.83926785, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84972262, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4527, + "time_per_iteration": 2.759881019592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.03936505, + "diversity_loss_mlp": 0.0, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.07304239619490156, + "language_loss": 0.81745154, + "learning_rate": 4.294050463490401e-05, + "loss": 0.8279348, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4528, + "time_per_iteration": 2.6849725246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_mlp": 1.04004014, + "diversity_loss_mlp": 0.0, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.08116186300687973, + "language_loss": 0.82389712, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83438438, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4529, + "time_per_iteration": 2.721444606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_mlp": 1.03866649, + "diversity_loss_mlp": 0.0, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.0788947608454547, + "language_loss": 0.73803437, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74850899, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4530, + "time_per_iteration": 3.0360207557678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047532, + "balance_loss_mlp": 1.03866839, + "diversity_loss_mlp": 0.0, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.061803367683131466, + "language_loss": 0.86130869, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87178397, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4531, + "time_per_iteration": 3.0060312747955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.04256225, + "diversity_loss_mlp": 0.0, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.08097144635360554, + "language_loss": 0.85292768, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86344463, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4532, + "time_per_iteration": 2.5708203315734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_mlp": 1.03403783, + "diversity_loss_mlp": 0.0, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.07173781512264084, + "language_loss": 0.7855528, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79597974, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.08660889, + "routerloss_mlp": 0.0, + "step": 4533, + "time_per_iteration": 2.714898109436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100683, + "balance_loss_mlp": 1.00234771, + "diversity_loss_mlp": 0.0, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.011018751042369157, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81973636, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4534, + "time_per_iteration": 4.830231189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046788, + "balance_loss_mlp": 1.03760934, + "diversity_loss_mlp": 0.0, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.0639859938433398, + "language_loss": 0.87151349, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88198137, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4535, + "time_per_iteration": 2.7394185066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_mlp": 1.03511095, + "diversity_loss_mlp": 0.0, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.07410951797613952, + "language_loss": 0.80976605, + "learning_rate": 4.193567838376888e-05, + "loss": 0.8202107, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4536, + "time_per_iteration": 2.5781943798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_mlp": 1.03959656, + "diversity_loss_mlp": 0.0, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.07408162868136768, + "language_loss": 0.82072723, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83121085, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4537, + "time_per_iteration": 2.6797525882720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_mlp": 1.03713083, + "diversity_loss_mlp": 0.0, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.07156355175880628, + "language_loss": 0.78797638, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79843724, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4538, + "time_per_iteration": 2.8440496921539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047687, + "balance_loss_mlp": 1.03858507, + "diversity_loss_mlp": 0.0, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.0722387407949978, + "language_loss": 0.79965913, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81013602, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4539, + "time_per_iteration": 2.721238136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_mlp": 1.04186094, + "diversity_loss_mlp": 0.0, + "epoch": 0.8734128510965756, + "flos": 561883972608.0, + "grad_norm": 0.12124336335781533, + "language_loss": 0.84041327, + "learning_rate": 4.143753177230242e-05, + "loss": 0.8509202, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4540, + "time_per_iteration": 2.6914098262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045135, + "balance_loss_mlp": 1.03622985, + "diversity_loss_mlp": 0.0, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.07799885017860995, + "language_loss": 0.79752243, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80797374, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4541, + "time_per_iteration": 2.93182110786438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_mlp": 1.03960705, + "diversity_loss_mlp": 0.0, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.06451256022818536, + "language_loss": 0.81514108, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82562894, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4542, + "time_per_iteration": 2.8326876163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00778397, + "balance_loss_mlp": 1.31047845, + "diversity_loss_mlp": 0.22450379, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.03126791623306444, + "language_loss": 0.81873107, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82651508, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01090602, + "step": 4543, + "time_per_iteration": 2.9323105812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048028, + "balance_loss_mlp": 1.03904009, + "diversity_loss_mlp": 0.0, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.09261999312040192, + "language_loss": 0.76578218, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77626246, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4544, + "time_per_iteration": 2.8980069160461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104863, + "balance_loss_mlp": 1.03955245, + "diversity_loss_mlp": 0.0, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.06860768160110936, + "language_loss": 0.83654785, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84703422, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4545, + "time_per_iteration": 2.7457897663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049582, + "balance_loss_mlp": 1.04058218, + "diversity_loss_mlp": 0.0, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.06696244649326027, + "language_loss": 0.82145166, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83194745, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4546, + "time_per_iteration": 2.5956528186798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050136, + "balance_loss_mlp": 1.04104638, + "diversity_loss_mlp": 0.0, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.06814063729509118, + "language_loss": 0.83736241, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84786379, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4547, + "time_per_iteration": 2.6598734855651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104913, + "balance_loss_mlp": 1.04005837, + "diversity_loss_mlp": 0.0, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.07262523755606552, + "language_loss": 0.80276871, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81325996, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4548, + "time_per_iteration": 3.042619466781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054339, + "balance_loss_mlp": 1.04545808, + "diversity_loss_mlp": 0.0, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.0643557055974673, + "language_loss": 0.79893917, + "learning_rate": 4.032712131660027e-05, + "loss": 0.80948257, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4549, + "time_per_iteration": 2.8232662677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.03677678, + "diversity_loss_mlp": 0.0, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.06974853076229501, + "language_loss": 0.78530467, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79576278, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4550, + "time_per_iteration": 2.7248096466064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046831, + "balance_loss_mlp": 1.03792024, + "diversity_loss_mlp": 0.0, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.08026438876668639, + "language_loss": 0.81858146, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82904983, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4551, + "time_per_iteration": 2.563875436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_mlp": 1.03774762, + "diversity_loss_mlp": 0.0, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.27955745224323525, + "language_loss": 0.81637728, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82684195, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4552, + "time_per_iteration": 2.810784339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.03973484, + "diversity_loss_mlp": 0.0, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.0711083365968444, + "language_loss": 0.78033483, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.79082167, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4553, + "time_per_iteration": 3.2460765838623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_mlp": 1.03957188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8761061946902655, + "flos": 802764338688.0, + "grad_norm": 0.05712124953956382, + "language_loss": 0.77816379, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78864872, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4554, + "time_per_iteration": 3.0767805576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_mlp": 1.03772795, + "diversity_loss_mlp": 0.0, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.0721600968568646, + "language_loss": 0.74639142, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75685859, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4555, + "time_per_iteration": 2.9832050800323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052245, + "balance_loss_mlp": 1.04343569, + "diversity_loss_mlp": 0.0, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.06902673277726463, + "language_loss": 0.80373311, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.8142556, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4556, + "time_per_iteration": 2.8642075061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.03882241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.06429651244751071, + "language_loss": 0.80069965, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81117713, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4557, + "time_per_iteration": 2.6734185218811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_mlp": 1.03912759, + "diversity_loss_mlp": 0.0, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.06573901979402896, + "language_loss": 0.78168076, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79216218, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4558, + "time_per_iteration": 2.5166428089141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049801, + "balance_loss_mlp": 1.04095614, + "diversity_loss_mlp": 0.0, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.0842466180792191, + "language_loss": 0.8216058, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83210379, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4559, + "time_per_iteration": 2.666722536087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050514, + "balance_loss_mlp": 1.04134107, + "diversity_loss_mlp": 0.0, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.07334962326293068, + "language_loss": 0.80860007, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81910527, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4560, + "time_per_iteration": 2.627713441848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.04125929, + "diversity_loss_mlp": 0.0, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.07694067808462435, + "language_loss": 0.8510192, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86152148, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4561, + "time_per_iteration": 2.5129141807556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.04076576, + "diversity_loss_mlp": 0.0, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.05712308761867227, + "language_loss": 0.83274788, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84324539, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4562, + "time_per_iteration": 2.6301164627075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775546, + "balance_loss_mlp": 1.3038888, + "diversity_loss_mlp": 0.22576925, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.034853936620068894, + "language_loss": 0.7813766, + "learning_rate": 3.862856098834189e-05, + "loss": 0.78913212, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01071687, + "step": 4563, + "time_per_iteration": 2.876042604446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055367, + "balance_loss_mlp": 1.04642081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.06747212929306415, + "language_loss": 0.80067873, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81123239, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4564, + "time_per_iteration": 2.8073532581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052003, + "balance_loss_mlp": 1.04328895, + "diversity_loss_mlp": 0.0, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.06003904599639906, + "language_loss": 0.77326131, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78378129, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4565, + "time_per_iteration": 2.6049962043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050973, + "balance_loss_mlp": 1.04202604, + "diversity_loss_mlp": 0.0, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.064833498730125, + "language_loss": 0.70079195, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71130168, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4566, + "time_per_iteration": 3.1731789112091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786853, + "balance_loss_mlp": 1.32932496, + "diversity_loss_mlp": 0.22292963, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.0397840730750478, + "language_loss": 0.76011282, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76798129, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072608, + "step": 4567, + "time_per_iteration": 2.697258472442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050311, + "balance_loss_mlp": 1.04135227, + "diversity_loss_mlp": 0.0, + "epoch": 0.8787995382839554, + "flos": 603448081920.0, + "grad_norm": 0.06722529563230402, + "language_loss": 0.77491319, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78541636, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4568, + "time_per_iteration": 2.840650796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050171, + "balance_loss_mlp": 1.04145098, + "diversity_loss_mlp": 0.0, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.05883368445240149, + "language_loss": 0.8492918, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85979354, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4569, + "time_per_iteration": 2.6557326316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_mlp": 1.04278803, + "diversity_loss_mlp": 0.0, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.07943052402500107, + "language_loss": 0.8255586, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83607388, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4570, + "time_per_iteration": 2.627609968185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.04396188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.06059091910308417, + "language_loss": 0.79351205, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80404216, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4571, + "time_per_iteration": 3.35367751121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.0433991, + "diversity_loss_mlp": 0.0, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06539899330048332, + "language_loss": 0.80981296, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82033718, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4572, + "time_per_iteration": 2.880627155303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_mlp": 1.0410192, + "diversity_loss_mlp": 0.0, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07570874184627417, + "language_loss": 0.88668913, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89718843, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4573, + "time_per_iteration": 2.563573122024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.04257524, + "diversity_loss_mlp": 0.0, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.06673280620392491, + "language_loss": 0.84119153, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.8517074, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4574, + "time_per_iteration": 2.6805808544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.04388571, + "diversity_loss_mlp": 0.0, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.07043061387858378, + "language_loss": 0.84413314, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85466063, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4575, + "time_per_iteration": 3.049510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.04210222, + "diversity_loss_mlp": 0.0, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06156041987406192, + "language_loss": 0.84676832, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85728043, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4576, + "time_per_iteration": 2.931907892227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050983, + "balance_loss_mlp": 1.04205978, + "diversity_loss_mlp": 0.0, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.05826624297126263, + "language_loss": 0.81173784, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82224762, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4577, + "time_per_iteration": 2.7370834350585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051952, + "balance_loss_mlp": 1.04316616, + "diversity_loss_mlp": 0.0, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.06645498049207266, + "language_loss": 0.81695998, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82747948, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4578, + "time_per_iteration": 2.7928130626678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105216, + "balance_loss_mlp": 1.04327333, + "diversity_loss_mlp": 0.0, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.06357300740797822, + "language_loss": 0.79227793, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80279958, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4579, + "time_per_iteration": 2.7231593132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050745, + "balance_loss_mlp": 1.04194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.059350650415536, + "language_loss": 0.76098466, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77149218, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4580, + "time_per_iteration": 2.683220624923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048772, + "balance_loss_mlp": 1.03990269, + "diversity_loss_mlp": 0.0, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.06771926957891432, + "language_loss": 0.81324798, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82373571, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4581, + "time_per_iteration": 2.6779592037200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_mlp": 1.04133832, + "diversity_loss_mlp": 0.0, + "epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.07585053291634766, + "language_loss": 0.79299724, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80349755, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4582, + "time_per_iteration": 2.831101894378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_mlp": 1.04368544, + "diversity_loss_mlp": 0.0, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.06530916783888785, + "language_loss": 0.85757875, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86810547, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4583, + "time_per_iteration": 2.7231874465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050068, + "balance_loss_mlp": 1.04128242, + "diversity_loss_mlp": 0.0, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.07680446741638405, + "language_loss": 0.82252479, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83302546, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4584, + "time_per_iteration": 2.6065118312835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796186, + "balance_loss_mlp": 1.34451175, + "diversity_loss_mlp": 0.22621799, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.03516245413492739, + "language_loss": 0.73908472, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74704659, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108207, + "step": 4585, + "time_per_iteration": 3.3458354473114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048695, + "balance_loss_mlp": 1.039891, + "diversity_loss_mlp": 0.0, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.06564674034294884, + "language_loss": 0.80001426, + "learning_rate": 3.591434321288345e-05, + "loss": 0.81050122, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4586, + "time_per_iteration": 2.72759747505188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_mlp": 1.04060817, + "diversity_loss_mlp": 0.0, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.07346558638928435, + "language_loss": 0.81996882, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83046365, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4587, + "time_per_iteration": 2.808663845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051315, + "balance_loss_mlp": 1.0421896, + "diversity_loss_mlp": 0.0, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.06304354104337369, + "language_loss": 0.78938949, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79990268, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4588, + "time_per_iteration": 2.573918581008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047855, + "balance_loss_mlp": 1.03888416, + "diversity_loss_mlp": 0.0, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.061374871286848334, + "language_loss": 0.83903325, + "learning_rate": 3.556732978508048e-05, + "loss": 0.8495118, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4589, + "time_per_iteration": 2.6800525188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049011, + "balance_loss_mlp": 1.04007053, + "diversity_loss_mlp": 0.0, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.06744146282588834, + "language_loss": 0.81342435, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82391441, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4590, + "time_per_iteration": 2.953735589981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052142, + "balance_loss_mlp": 1.04338574, + "diversity_loss_mlp": 0.0, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.07827681611400703, + "language_loss": 0.81570184, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82622325, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4591, + "time_per_iteration": 2.611823081970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_mlp": 1.04045248, + "diversity_loss_mlp": 0.0, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07488922713809969, + "language_loss": 0.82166886, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83216357, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4592, + "time_per_iteration": 2.820740222930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049441, + "balance_loss_mlp": 1.04061973, + "diversity_loss_mlp": 0.0, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.06826234415728213, + "language_loss": 0.82207388, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83256829, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4593, + "time_per_iteration": 2.7582898139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048958, + "balance_loss_mlp": 1.04009509, + "diversity_loss_mlp": 0.0, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.07322628079560306, + "language_loss": 0.80310255, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81359208, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4594, + "time_per_iteration": 2.7062149047851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051053, + "balance_loss_mlp": 1.04161763, + "diversity_loss_mlp": 0.0, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.08697939284189399, + "language_loss": 0.77308345, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78359401, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 4595, + "time_per_iteration": 2.6008739471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047041, + "balance_loss_mlp": 1.03805816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.07630739769725799, + "language_loss": 0.79033625, + "learning_rate": 3.47639446766777e-05, + "loss": 0.80080664, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4596, + "time_per_iteration": 2.8426897525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_mlp": 1.040079, + "diversity_loss_mlp": 0.0, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.06236969459816259, + "language_loss": 0.82549202, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.83598149, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4597, + "time_per_iteration": 3.0126264095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050555, + "balance_loss_mlp": 1.0417217, + "diversity_loss_mlp": 0.0, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.057498871629657215, + "language_loss": 0.82855976, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83906525, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4598, + "time_per_iteration": 2.9096622467041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779413, + "balance_loss_mlp": 1.31441939, + "diversity_loss_mlp": 0.22293654, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.031937649468038294, + "language_loss": 0.80943024, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81722438, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073514, + "step": 4599, + "time_per_iteration": 2.752638339996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049498, + "balance_loss_mlp": 1.04064703, + "diversity_loss_mlp": 0.0, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.06795094778934727, + "language_loss": 0.84458822, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85508323, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4600, + "time_per_iteration": 2.663498878479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779393, + "balance_loss_mlp": 1.31195164, + "diversity_loss_mlp": 0.22577716, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.03181593301262544, + "language_loss": 0.83776021, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84555423, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01052869, + "step": 4601, + "time_per_iteration": 2.7995564937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046945, + "balance_loss_mlp": 1.0379926, + "diversity_loss_mlp": 0.0, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.06356049403382279, + "language_loss": 0.80725026, + "learning_rate": 3.408237248940088e-05, + "loss": 0.8177197, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4602, + "time_per_iteration": 2.6017932891845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_mlp": 1.03828001, + "diversity_loss_mlp": 0.0, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.07035000464547823, + "language_loss": 0.77883828, + "learning_rate": 3.396940996663683e-05, + "loss": 0.78931201, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4603, + "time_per_iteration": 2.9521942138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046951, + "balance_loss_mlp": 1.03792644, + "diversity_loss_mlp": 0.0, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06898692389267871, + "language_loss": 0.78990823, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80037773, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4604, + "time_per_iteration": 2.5854694843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.03895068, + "diversity_loss_mlp": 0.0, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.06638743776056398, + "language_loss": 0.81713545, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82761252, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4605, + "time_per_iteration": 2.692868232727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.03658962, + "diversity_loss_mlp": 0.0, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.06624513803881459, + "language_loss": 0.85526776, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86572611, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4606, + "time_per_iteration": 2.6592142581939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790369, + "balance_loss_mlp": 1.33229494, + "diversity_loss_mlp": 0.22699621, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.03136786172758775, + "language_loss": 0.79641789, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80432159, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072356, + "step": 4607, + "time_per_iteration": 2.7557034492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048991, + "balance_loss_mlp": 1.03997266, + "diversity_loss_mlp": 0.0, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.053068589539523224, + "language_loss": 0.83634484, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84683472, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4608, + "time_per_iteration": 2.970646381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013538, + "balance_loss_mlp": 1.00912714, + "diversity_loss_mlp": 0.0, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.013952158084226052, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79844493, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4609, + "time_per_iteration": 4.800167798995972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.03707361, + "diversity_loss_mlp": 0.0, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.06983974762090492, + "language_loss": 0.82014269, + "learning_rate": 3.3183740769755e-05, + "loss": 0.83060396, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4610, + "time_per_iteration": 3.0428099632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013271, + "balance_loss_mlp": 1.00885999, + "diversity_loss_mlp": 0.0, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.013954976330346456, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77924109, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4611, + "time_per_iteration": 4.960276126861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_mlp": 1.04021323, + "diversity_loss_mlp": 0.0, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.06747784662244205, + "language_loss": 0.75143421, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76192367, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4612, + "time_per_iteration": 2.8096370697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046218, + "balance_loss_mlp": 1.03731275, + "diversity_loss_mlp": 0.0, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.081523690910391, + "language_loss": 0.83038783, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84084994, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4613, + "time_per_iteration": 2.6296303272247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_mlp": 1.0389961, + "diversity_loss_mlp": 0.0, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.07384350898299535, + "language_loss": 0.79394948, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80442905, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4614, + "time_per_iteration": 3.9052226543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045945, + "balance_loss_mlp": 1.0370816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.06075632435028376, + "language_loss": 0.84765017, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85810959, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4615, + "time_per_iteration": 2.784306764602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049902, + "balance_loss_mlp": 1.04100347, + "diversity_loss_mlp": 0.0, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.07661340087165963, + "language_loss": 0.81347793, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82397699, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4616, + "time_per_iteration": 2.585916042327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779874, + "balance_loss_mlp": 1.31519485, + "diversity_loss_mlp": 0.22310758, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.03294259540614503, + "language_loss": 0.79988885, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80768752, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072259, + "step": 4617, + "time_per_iteration": 2.658268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_mlp": 1.03512335, + "diversity_loss_mlp": 0.0, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.08219678758811591, + "language_loss": 0.83779407, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84823501, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4618, + "time_per_iteration": 2.6499626636505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013119, + "balance_loss_mlp": 1.00873196, + "diversity_loss_mlp": 0.0, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 0.01269771212796008, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79525316, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.04394531, + "routerloss_mlp": 0.0, + "step": 4619, + "time_per_iteration": 5.039214134216309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048007, + "balance_loss_mlp": 1.03929269, + "diversity_loss_mlp": 0.0, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.06229683334708209, + "language_loss": 0.82604653, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83652663, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4620, + "time_per_iteration": 2.987938404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044784, + "balance_loss_mlp": 1.03616548, + "diversity_loss_mlp": 0.0, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.0772642935579886, + "language_loss": 0.8405602, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.851008, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 4621, + "time_per_iteration": 3.1390573978424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050592, + "balance_loss_mlp": 1.04172254, + "diversity_loss_mlp": 0.0, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.06838136238403997, + "language_loss": 0.81778359, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82828951, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4622, + "time_per_iteration": 2.799467086791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047387, + "balance_loss_mlp": 1.03847051, + "diversity_loss_mlp": 0.0, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.0659043400927782, + "language_loss": 0.82619703, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83667088, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4623, + "time_per_iteration": 2.7340970039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046712, + "balance_loss_mlp": 1.0377115, + "diversity_loss_mlp": 0.0, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.06558378954602251, + "language_loss": 0.81849378, + "learning_rate": 3.163905853111054e-05, + "loss": 0.8289609, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4624, + "time_per_iteration": 2.6568024158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.03908443, + "diversity_loss_mlp": 0.0, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.060975907763050036, + "language_loss": 0.81057096, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82105064, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4625, + "time_per_iteration": 2.7340495586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044582, + "balance_loss_mlp": 1.03537273, + "diversity_loss_mlp": 0.0, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.07485889575749058, + "language_loss": 0.770868, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78131384, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4626, + "time_per_iteration": 3.187793016433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.04202616, + "diversity_loss_mlp": 0.0, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.08455877289506715, + "language_loss": 0.8016057, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81211603, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4627, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104463, + "balance_loss_mlp": 1.03559375, + "diversity_loss_mlp": 0.0, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.06293120132110656, + "language_loss": 0.80719471, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81764102, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4628, + "time_per_iteration": 2.9961817264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_mlp": 1.03616869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.08203467156217556, + "language_loss": 0.81727576, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82772422, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4629, + "time_per_iteration": 2.679408073425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_mlp": 1.03355646, + "diversity_loss_mlp": 0.0, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.06898009343071365, + "language_loss": 0.79810011, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80852401, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4630, + "time_per_iteration": 2.83233380317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_mlp": 1.03328514, + "diversity_loss_mlp": 0.0, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.0715777029832187, + "language_loss": 0.7953496, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80576855, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4631, + "time_per_iteration": 3.12410569190979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077771, + "balance_loss_mlp": 1.31088805, + "diversity_loss_mlp": 0.22250512, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.032192261312759214, + "language_loss": 0.84286821, + "learning_rate": 3.077237681615208e-05, + "loss": 0.8506453, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01101306, + "step": 4632, + "time_per_iteration": 2.703425884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_mlp": 1.04004884, + "diversity_loss_mlp": 0.0, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 0.08188608007058847, + "language_loss": 0.84165525, + "learning_rate": 3.066486092807874e-05, + "loss": 0.85214841, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4633, + "time_per_iteration": 2.712557554244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_mlp": 1.03861618, + "diversity_loss_mlp": 0.0, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.06060123366569166, + "language_loss": 0.85206622, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86254251, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4634, + "time_per_iteration": 2.630039691925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_mlp": 1.03316331, + "diversity_loss_mlp": 0.0, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.06527746139553993, + "language_loss": 0.8135035, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82392418, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4635, + "time_per_iteration": 2.5558903217315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047554, + "balance_loss_mlp": 1.03875005, + "diversity_loss_mlp": 0.0, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.06346729793174329, + "language_loss": 0.78307879, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79355425, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4636, + "time_per_iteration": 2.7006030082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_mlp": 1.03714168, + "diversity_loss_mlp": 0.0, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06783278448064689, + "language_loss": 0.8109082, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82136714, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4637, + "time_per_iteration": 2.6627137660980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043668, + "balance_loss_mlp": 1.03518057, + "diversity_loss_mlp": 0.0, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.06701291241567459, + "language_loss": 0.84168345, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85212016, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 4638, + "time_per_iteration": 2.7190563678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_mlp": 1.04025865, + "diversity_loss_mlp": 0.0, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.06480897369874776, + "language_loss": 0.79137188, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80186164, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4639, + "time_per_iteration": 2.7548539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_mlp": 1.03746128, + "diversity_loss_mlp": 0.0, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.06545758779491198, + "language_loss": 0.81798422, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82844687, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4640, + "time_per_iteration": 2.7450599670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_mlp": 1.03710771, + "diversity_loss_mlp": 0.0, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.07321859189533414, + "language_loss": 0.80895549, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81941569, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4641, + "time_per_iteration": 2.5623698234558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003551, + "balance_loss_mlp": 0.99911606, + "diversity_loss_mlp": 0.0, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.005611533508350328, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81334412, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4642, + "time_per_iteration": 4.691712379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_mlp": 1.03812027, + "diversity_loss_mlp": 0.0, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.0756626581840296, + "language_loss": 0.8056438, + "learning_rate": 2.95997305629786e-05, + "loss": 0.8161152, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4643, + "time_per_iteration": 2.774066925048828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_mlp": 1.03975809, + "diversity_loss_mlp": 0.0, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.07062905944842346, + "language_loss": 0.84894288, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85943139, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4644, + "time_per_iteration": 2.6488940715789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_mlp": 1.03935552, + "diversity_loss_mlp": 0.0, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.0836667751857819, + "language_loss": 0.78037202, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79085219, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4645, + "time_per_iteration": 2.583796977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_mlp": 1.04036856, + "diversity_loss_mlp": 0.0, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.05897365940384636, + "language_loss": 0.807109, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81759953, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4646, + "time_per_iteration": 3.2107162475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_mlp": 1.03690004, + "diversity_loss_mlp": 0.0, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 0.06566650575637094, + "language_loss": 0.8383972, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.8488546, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4647, + "time_per_iteration": 2.742075204849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_mlp": 1.04111314, + "diversity_loss_mlp": 0.0, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.07362813813067959, + "language_loss": 0.81445944, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82496303, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4648, + "time_per_iteration": 2.664386510848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_mlp": 1.03510404, + "diversity_loss_mlp": 0.0, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.06107370863093702, + "language_loss": 0.80719924, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.81763935, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4649, + "time_per_iteration": 2.9920804500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047105, + "balance_loss_mlp": 1.03825331, + "diversity_loss_mlp": 0.0, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.06165388839592064, + "language_loss": 0.84649253, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85696357, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4650, + "time_per_iteration": 2.6212713718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_mlp": 1.03758526, + "diversity_loss_mlp": 0.0, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.06579934808698863, + "language_loss": 0.83054405, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84100872, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4651, + "time_per_iteration": 2.671393394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_mlp": 1.03632951, + "diversity_loss_mlp": 0.0, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.06478595695479929, + "language_loss": 0.81956565, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.83001965, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4652, + "time_per_iteration": 2.849560499191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_mlp": 1.03662467, + "diversity_loss_mlp": 0.0, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.06805126112229812, + "language_loss": 0.7762472, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78670454, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4653, + "time_per_iteration": 2.9823384284973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.03572643, + "diversity_loss_mlp": 0.0, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.06521391394645211, + "language_loss": 0.86080307, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87125206, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4654, + "time_per_iteration": 2.7805397510528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048835, + "balance_loss_mlp": 1.03972173, + "diversity_loss_mlp": 0.0, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.0849204409565989, + "language_loss": 0.83320463, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84369302, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4655, + "time_per_iteration": 2.876401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780551, + "balance_loss_mlp": 1.31460428, + "diversity_loss_mlp": 0.22509943, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.034355787829583595, + "language_loss": 0.77789617, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78570163, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0106987, + "step": 4656, + "time_per_iteration": 3.0823395252227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_mlp": 1.03641081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.0676440423058397, + "language_loss": 0.77287573, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78333104, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4657, + "time_per_iteration": 2.64528751373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048024, + "balance_loss_mlp": 1.03879762, + "diversity_loss_mlp": 0.0, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.0693704945431175, + "language_loss": 0.77242142, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78290164, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4658, + "time_per_iteration": 2.6108851432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_mlp": 1.03874731, + "diversity_loss_mlp": 0.0, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.0647769416450041, + "language_loss": 0.83169466, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84217411, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4659, + "time_per_iteration": 2.605060338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791335, + "balance_loss_mlp": 1.33468997, + "diversity_loss_mlp": 0.22667646, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.035487365759697125, + "language_loss": 0.82103157, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82894492, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01065169, + "step": 4660, + "time_per_iteration": 2.7054848670959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_mlp": 1.0363133, + "diversity_loss_mlp": 0.0, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 0.08335321412533339, + "language_loss": 0.81629348, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82674694, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4661, + "time_per_iteration": 2.6878600120544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_mlp": 1.04299188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.08023362288618259, + "language_loss": 0.84117174, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85168993, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4662, + "time_per_iteration": 2.9065072536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783028, + "balance_loss_mlp": 1.31757593, + "diversity_loss_mlp": 0.22707665, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.028939334122514253, + "language_loss": 0.84291148, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85074168, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01070135, + "step": 4663, + "time_per_iteration": 2.978598117828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_mlp": 1.03588283, + "diversity_loss_mlp": 0.0, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.09868574536780622, + "language_loss": 0.75424099, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76468909, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4664, + "time_per_iteration": 2.7175958156585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104879, + "balance_loss_mlp": 1.03972983, + "diversity_loss_mlp": 0.0, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.07019842465420709, + "language_loss": 0.83128035, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84176832, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4665, + "time_per_iteration": 2.7153587341308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_mlp": 1.03744328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.06031238876791543, + "language_loss": 0.87254226, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88300675, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4666, + "time_per_iteration": 2.6804378032684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_mlp": 1.03897464, + "diversity_loss_mlp": 0.0, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.05793843844833838, + "language_loss": 0.82573009, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83620781, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4667, + "time_per_iteration": 2.6166820526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.03860664, + "diversity_loss_mlp": 0.0, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.057031250426829085, + "language_loss": 0.82203746, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.8325128, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4668, + "time_per_iteration": 2.7726669311523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_mlp": 1.04209065, + "diversity_loss_mlp": 0.0, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.07157029094935193, + "language_loss": 0.82771599, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83822584, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4669, + "time_per_iteration": 2.934701681137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050412, + "balance_loss_mlp": 1.04158425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.07594625881413491, + "language_loss": 0.77720773, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78771186, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4670, + "time_per_iteration": 3.232701539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050559, + "balance_loss_mlp": 1.04171383, + "diversity_loss_mlp": 0.0, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.07298208517048191, + "language_loss": 0.75987267, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77037835, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4671, + "time_per_iteration": 3.183443784713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.03399336, + "diversity_loss_mlp": 0.0, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.0671693421720064, + "language_loss": 0.76635265, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77678287, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4672, + "time_per_iteration": 2.657771587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047693, + "balance_loss_mlp": 1.03888941, + "diversity_loss_mlp": 0.0, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.07004510948289375, + "language_loss": 0.86707628, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87755322, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4673, + "time_per_iteration": 2.535236120223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_mlp": 1.03844619, + "diversity_loss_mlp": 0.0, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.07892824616979383, + "language_loss": 0.75965667, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77013206, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4674, + "time_per_iteration": 2.6591787338256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048875, + "balance_loss_mlp": 1.03979182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 0.06983733159730086, + "language_loss": 0.80178928, + "learning_rate": 2.631423662948984e-05, + "loss": 0.81227803, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4675, + "time_per_iteration": 2.5485310554504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048014, + "balance_loss_mlp": 1.03897238, + "diversity_loss_mlp": 0.0, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.07663293464144452, + "language_loss": 0.82886845, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83934855, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4676, + "time_per_iteration": 2.712852954864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_mlp": 1.03895569, + "diversity_loss_mlp": 0.0, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.063501986784752, + "language_loss": 0.8503803, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.86085933, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4677, + "time_per_iteration": 2.700191020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_mlp": 1.04097605, + "diversity_loss_mlp": 0.0, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.06651601339856017, + "language_loss": 0.80581087, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81630689, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 4678, + "time_per_iteration": 2.815133571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004703, + "balance_loss_mlp": 1.00024414, + "diversity_loss_mlp": 0.0, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.0032341066943480366, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86788726, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4679, + "time_per_iteration": 4.805148124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051809, + "balance_loss_mlp": 1.0428679, + "diversity_loss_mlp": 0.0, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.07566932247626351, + "language_loss": 0.79916567, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.8096838, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4680, + "time_per_iteration": 2.844775915145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_mlp": 1.03798509, + "diversity_loss_mlp": 0.0, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.06791957432772232, + "language_loss": 0.78502154, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.7954905, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4681, + "time_per_iteration": 2.6303482055664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_mlp": 1.03633404, + "diversity_loss_mlp": 0.0, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.08260546999078933, + "language_loss": 0.86167276, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.872123, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4682, + "time_per_iteration": 2.5562498569488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104895, + "balance_loss_mlp": 1.04019439, + "diversity_loss_mlp": 0.0, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.07052497776440367, + "language_loss": 0.78726637, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79775584, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4683, + "time_per_iteration": 2.8474693298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_mlp": 1.03673279, + "diversity_loss_mlp": 0.0, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.058284794620577896, + "language_loss": 0.84781289, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.85826856, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4684, + "time_per_iteration": 2.6325201988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.03712106, + "diversity_loss_mlp": 0.0, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.07314098955075891, + "language_loss": 0.82745099, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83790988, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4685, + "time_per_iteration": 2.7466516494750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104569, + "balance_loss_mlp": 1.03673732, + "diversity_loss_mlp": 0.0, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.06299423790772288, + "language_loss": 0.81375784, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82421476, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4686, + "time_per_iteration": 2.8934953212738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.03924799, + "diversity_loss_mlp": 0.0, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.07273312761869775, + "language_loss": 0.81357133, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82405281, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4687, + "time_per_iteration": 2.7839083671569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046931, + "balance_loss_mlp": 1.03816867, + "diversity_loss_mlp": 0.0, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.05747241213566195, + "language_loss": 0.86223972, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87270904, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4688, + "time_per_iteration": 2.8053431510925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.03985882, + "diversity_loss_mlp": 0.0, + "epoch": 0.9020777222008465, + "flos": 523284820992.0, + "grad_norm": 0.06831532416346216, + "language_loss": 0.77670169, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78718954, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4689, + "time_per_iteration": 2.6122989654541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.03666687, + "diversity_loss_mlp": 0.0, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.05580417916624313, + "language_loss": 0.81750822, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82796389, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4690, + "time_per_iteration": 2.8226675987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049641, + "balance_loss_mlp": 1.04086757, + "diversity_loss_mlp": 0.0, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.07066245461166361, + "language_loss": 0.84397352, + "learning_rate": 2.474202664305253e-05, + "loss": 0.8544699, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4691, + "time_per_iteration": 2.608060359954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046874, + "balance_loss_mlp": 1.03758168, + "diversity_loss_mlp": 0.0, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.06466025971704324, + "language_loss": 0.86426198, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87473077, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 4692, + "time_per_iteration": 2.63151216506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.0386498, + "diversity_loss_mlp": 0.0, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.06521986088761798, + "language_loss": 0.73844278, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74891818, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4693, + "time_per_iteration": 2.833467483520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048441, + "balance_loss_mlp": 1.0395714, + "diversity_loss_mlp": 0.0, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.07181614420601379, + "language_loss": 0.82029641, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.8307808, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4694, + "time_per_iteration": 2.6215834617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050504, + "balance_loss_mlp": 1.04152727, + "diversity_loss_mlp": 0.0, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.07933043955400586, + "language_loss": 0.8251496, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83565462, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4695, + "time_per_iteration": 2.9662675857543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045657, + "balance_loss_mlp": 1.03683591, + "diversity_loss_mlp": 0.0, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.08647194091584645, + "language_loss": 0.76889336, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77934992, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4696, + "time_per_iteration": 2.6831114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_mlp": 1.03765512, + "diversity_loss_mlp": 0.0, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.06589427726191109, + "language_loss": 0.82852316, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83898699, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4697, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046118, + "balance_loss_mlp": 1.03745151, + "diversity_loss_mlp": 0.0, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.07072654359751072, + "language_loss": 0.79079431, + "learning_rate": 2.406902878347017e-05, + "loss": 0.80125546, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 4698, + "time_per_iteration": 2.6087543964385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049163, + "balance_loss_mlp": 1.03998375, + "diversity_loss_mlp": 0.0, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.08844604656187115, + "language_loss": 0.81696689, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8274585, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4699, + "time_per_iteration": 2.6180419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_mlp": 1.03545141, + "diversity_loss_mlp": 0.0, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.06789594949929362, + "language_loss": 0.80433279, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81477618, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4700, + "time_per_iteration": 2.8047759532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_mlp": 1.03756499, + "diversity_loss_mlp": 0.0, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.07594330446268198, + "language_loss": 0.77877766, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78924191, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4701, + "time_per_iteration": 2.5752878189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_mlp": 0.99879241, + "diversity_loss_mlp": 0.0, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.003648556595750329, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73933041, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4702, + "time_per_iteration": 4.9735963344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_mlp": 1.04117787, + "diversity_loss_mlp": 0.0, + "epoch": 0.9047710657945364, + "flos": 585841144320.0, + "grad_norm": 0.08131986828145982, + "language_loss": 0.8338269, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84432721, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4703, + "time_per_iteration": 2.736764430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.0364728, + "diversity_loss_mlp": 0.0, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.085064980666539, + "language_loss": 0.79620826, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80666363, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4704, + "time_per_iteration": 2.7620725631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.04021692, + "diversity_loss_mlp": 0.0, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.08171845507100765, + "language_loss": 0.74530506, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75579304, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.0859375, + "routerloss_mlp": 0.0, + "step": 4705, + "time_per_iteration": 2.6691336631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048986, + "balance_loss_mlp": 1.0402658, + "diversity_loss_mlp": 0.0, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.08031830917867225, + "language_loss": 0.79134667, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80183655, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4706, + "time_per_iteration": 2.657421350479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049352, + "balance_loss_mlp": 1.04074478, + "diversity_loss_mlp": 0.0, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.07852771434357471, + "language_loss": 0.81530303, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82579654, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4707, + "time_per_iteration": 2.6042165756225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_mlp": 1.03612292, + "diversity_loss_mlp": 0.0, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.052073742250211955, + "language_loss": 0.85184813, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86229986, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4708, + "time_per_iteration": 3.205712080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048052, + "balance_loss_mlp": 1.03921902, + "diversity_loss_mlp": 0.0, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.07208392805658173, + "language_loss": 0.83473063, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84521115, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4709, + "time_per_iteration": 3.15082049369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.03755391, + "diversity_loss_mlp": 0.0, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.09897458123618827, + "language_loss": 0.77498788, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78545511, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4710, + "time_per_iteration": 2.856567144393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_mlp": 1.03875113, + "diversity_loss_mlp": 0.0, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.06579655928741501, + "language_loss": 0.82653207, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83700836, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4711, + "time_per_iteration": 2.8177871704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_mlp": 1.03596139, + "diversity_loss_mlp": 0.0, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.06583523405134029, + "language_loss": 0.78812146, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.79856777, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4712, + "time_per_iteration": 2.880993604660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010452, + "balance_loss_mlp": 1.03643262, + "diversity_loss_mlp": 0.0, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.07415444506941751, + "language_loss": 0.80136561, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81181759, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4713, + "time_per_iteration": 2.627692937850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_mlp": 1.03697634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.06943776230353088, + "language_loss": 0.84932685, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85978746, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4714, + "time_per_iteration": 2.5953822135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049292, + "balance_loss_mlp": 1.04006529, + "diversity_loss_mlp": 0.0, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.07092231807138824, + "language_loss": 0.79715693, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80764985, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4715, + "time_per_iteration": 2.7853944301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.03861296, + "diversity_loss_mlp": 0.0, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.08464437568581946, + "language_loss": 0.7548542, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.765329, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4716, + "time_per_iteration": 2.7233853340148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_mlp": 1.03780317, + "diversity_loss_mlp": 0.0, + "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.07842659824105606, + "language_loss": 0.88551593, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89598, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4717, + "time_per_iteration": 2.66381573677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_mlp": 1.03856564, + "diversity_loss_mlp": 0.0, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.06367124229028898, + "language_loss": 0.82281721, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83329189, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4718, + "time_per_iteration": 2.7830944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047901, + "balance_loss_mlp": 1.03901446, + "diversity_loss_mlp": 0.0, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.07734106151470267, + "language_loss": 0.81955713, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.83003616, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4719, + "time_per_iteration": 3.1287927627563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046133, + "balance_loss_mlp": 1.03729379, + "diversity_loss_mlp": 0.0, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.060901042499375765, + "language_loss": 0.86802292, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87848425, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4720, + "time_per_iteration": 2.8568358421325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_mlp": 1.03286505, + "diversity_loss_mlp": 0.0, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.06480953268672687, + "language_loss": 0.79562217, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80603969, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4721, + "time_per_iteration": 2.7394514083862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104847, + "balance_loss_mlp": 1.03955877, + "diversity_loss_mlp": 0.0, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.09226866260525313, + "language_loss": 0.84760112, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85808581, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4722, + "time_per_iteration": 2.616605281829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_mlp": 1.03716016, + "diversity_loss_mlp": 0.0, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.07637156979590433, + "language_loss": 0.80386579, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81432664, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4723, + "time_per_iteration": 2.740421772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_mlp": 1.03707719, + "diversity_loss_mlp": 0.0, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.06514717136506207, + "language_loss": 0.75284863, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76330721, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4724, + "time_per_iteration": 3.563429117202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_mlp": 1.0434463, + "diversity_loss_mlp": 0.0, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.06971007170583818, + "language_loss": 0.76744211, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77796578, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4725, + "time_per_iteration": 2.658780336380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052144, + "balance_loss_mlp": 1.0433048, + "diversity_loss_mlp": 0.0, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.06413099042531242, + "language_loss": 0.84407449, + "learning_rate": 2.146770131403658e-05, + "loss": 0.8545959, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4726, + "time_per_iteration": 2.6778671741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_mlp": 1.04029298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.07280363304099743, + "language_loss": 0.81181479, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82230693, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4727, + "time_per_iteration": 2.6568636894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_mlp": 1.04238701, + "diversity_loss_mlp": 0.0, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.0725280737417026, + "language_loss": 0.81922674, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82973742, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4728, + "time_per_iteration": 2.643022060394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_mlp": 1.03610396, + "diversity_loss_mlp": 0.0, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.0673800156354799, + "language_loss": 0.84635472, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85680401, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4729, + "time_per_iteration": 2.724855661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_mlp": 1.03952658, + "diversity_loss_mlp": 0.0, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.07330494114530435, + "language_loss": 0.79589331, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80637789, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4730, + "time_per_iteration": 2.665303945541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_mlp": 1.03856754, + "diversity_loss_mlp": 0.0, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.078385767023693, + "language_loss": 0.80267072, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81314552, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4731, + "time_per_iteration": 3.366713762283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046528, + "balance_loss_mlp": 1.03736663, + "diversity_loss_mlp": 0.0, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.08027492001685438, + "language_loss": 0.81851661, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82898188, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4732, + "time_per_iteration": 2.511289119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_mlp": 1.04403806, + "diversity_loss_mlp": 0.0, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.07912673976757961, + "language_loss": 0.77801937, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.7885493, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4733, + "time_per_iteration": 2.6270110607147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048563, + "balance_loss_mlp": 1.03949749, + "diversity_loss_mlp": 0.0, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.055649375090756015, + "language_loss": 0.84341621, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85390186, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4734, + "time_per_iteration": 2.8428561687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_mlp": 1.0395968, + "diversity_loss_mlp": 0.0, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.07562354165732797, + "language_loss": 0.84999311, + "learning_rate": 2.066245558029256e-05, + "loss": 0.8604784, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4735, + "time_per_iteration": 2.617300033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047339, + "balance_loss_mlp": 1.03857076, + "diversity_loss_mlp": 0.0, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.06845754764753385, + "language_loss": 0.84216273, + "learning_rate": 2.057391384781182e-05, + "loss": 0.8526361, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4736, + "time_per_iteration": 2.621656894683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053341, + "balance_loss_mlp": 1.04450214, + "diversity_loss_mlp": 0.0, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.07185753448877732, + "language_loss": 0.83150327, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.8420366, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4737, + "time_per_iteration": 2.6248881816864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052271, + "balance_loss_mlp": 1.04334199, + "diversity_loss_mlp": 0.0, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.06362345813560902, + "language_loss": 0.81097478, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.8214975, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4738, + "time_per_iteration": 2.6537606716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050862, + "balance_loss_mlp": 1.0419693, + "diversity_loss_mlp": 0.0, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.06023003948048014, + "language_loss": 0.81882358, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.82933223, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4739, + "time_per_iteration": 2.7091641426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.03856826, + "diversity_loss_mlp": 0.0, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.06392422998543029, + "language_loss": 0.82626665, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.8367427, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4740, + "time_per_iteration": 2.762544631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049849, + "balance_loss_mlp": 1.04099774, + "diversity_loss_mlp": 0.0, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.0822598036225358, + "language_loss": 0.78046763, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79096615, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4741, + "time_per_iteration": 2.84562087059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.04134798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.06551662933562434, + "language_loss": 0.857319, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86782068, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4742, + "time_per_iteration": 2.8531861305236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050211, + "balance_loss_mlp": 1.04143143, + "diversity_loss_mlp": 0.0, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.08699441594773756, + "language_loss": 0.87479031, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88529241, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4743, + "time_per_iteration": 2.650739908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049409, + "balance_loss_mlp": 1.04080176, + "diversity_loss_mlp": 0.0, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.06693150560912724, + "language_loss": 0.826424, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83691812, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4744, + "time_per_iteration": 2.679450035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_mlp": 1.03671229, + "diversity_loss_mlp": 0.0, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.08010451753321661, + "language_loss": 0.79965168, + "learning_rate": 1.978541819374574e-05, + "loss": 0.81010902, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4745, + "time_per_iteration": 2.5925939083099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_mlp": 1.03974199, + "diversity_loss_mlp": 0.0, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06455396152064795, + "language_loss": 0.82245004, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83293486, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4746, + "time_per_iteration": 2.6314661502838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_mlp": 1.04123759, + "diversity_loss_mlp": 0.0, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.06909556408267023, + "language_loss": 0.83497131, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84546977, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4747, + "time_per_iteration": 2.5474631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046479, + "balance_loss_mlp": 1.03760934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.07312632583700283, + "language_loss": 0.79836029, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80882508, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4748, + "time_per_iteration": 2.680522918701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050241, + "balance_loss_mlp": 1.04132986, + "diversity_loss_mlp": 0.0, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.06502832751654097, + "language_loss": 0.83780789, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.84831029, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4749, + "time_per_iteration": 2.7452023029327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.04147661, + "diversity_loss_mlp": 0.0, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.07375447300189412, + "language_loss": 0.82539463, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.83589756, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4750, + "time_per_iteration": 2.6701533794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_mlp": 1.04132831, + "diversity_loss_mlp": 0.0, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.06117546898764861, + "language_loss": 0.90313232, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91363287, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4751, + "time_per_iteration": 2.8322813510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_mlp": 1.03372943, + "diversity_loss_mlp": 0.0, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.05974577392766342, + "language_loss": 0.84016383, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85059029, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4752, + "time_per_iteration": 2.688077449798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050001, + "balance_loss_mlp": 1.04098237, + "diversity_loss_mlp": 0.0, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.06413328541809935, + "language_loss": 0.75752521, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76802522, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4753, + "time_per_iteration": 2.650331974029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.03808916, + "diversity_loss_mlp": 0.0, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.08121838802327101, + "language_loss": 0.80860132, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81907237, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4754, + "time_per_iteration": 2.6111409664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051972, + "balance_loss_mlp": 1.04308462, + "diversity_loss_mlp": 0.0, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.06557647778558516, + "language_loss": 0.79137278, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80189246, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4755, + "time_per_iteration": 2.6349050998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_mlp": 1.0382818, + "diversity_loss_mlp": 0.0, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.06908775382754948, + "language_loss": 0.85741401, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.8678841, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4756, + "time_per_iteration": 2.681579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_mlp": 1.03613043, + "diversity_loss_mlp": 0.0, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.06619379563809555, + "language_loss": 0.81299222, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82344431, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4757, + "time_per_iteration": 2.8199880123138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104812, + "balance_loss_mlp": 1.03948975, + "diversity_loss_mlp": 0.0, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.07903863840267403, + "language_loss": 0.82496881, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83544993, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4758, + "time_per_iteration": 2.7341158390045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789047, + "balance_loss_mlp": 1.333637, + "diversity_loss_mlp": 0.22318089, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.0321241392563351, + "language_loss": 0.83172476, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83961523, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01063856, + "step": 4759, + "time_per_iteration": 2.597241163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008181, + "balance_loss_mlp": 1.00372291, + "diversity_loss_mlp": 0.0, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006260194037571693, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75827253, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4760, + "time_per_iteration": 4.852627754211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_mlp": 1.00331163, + "diversity_loss_mlp": 0.0, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006798931475656377, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80583847, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4761, + "time_per_iteration": 4.994880437850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047238, + "balance_loss_mlp": 1.03847599, + "diversity_loss_mlp": 0.0, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.05790619573319675, + "language_loss": 0.80362964, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81410205, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4762, + "time_per_iteration": 2.752257823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046406, + "balance_loss_mlp": 1.03779316, + "diversity_loss_mlp": 0.0, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.07895774001894396, + "language_loss": 0.8113842, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.82184827, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.08624268, + "routerloss_mlp": 0.0, + "step": 4763, + "time_per_iteration": 2.7287051677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.03780174, + "diversity_loss_mlp": 0.0, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.06309721497849985, + "language_loss": 0.8474853, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85795045, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4764, + "time_per_iteration": 3.033647060394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_mlp": 1.03908885, + "diversity_loss_mlp": 0.0, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.06604892374800723, + "language_loss": 0.8237828, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83426178, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4765, + "time_per_iteration": 2.6534650325775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.03842866, + "diversity_loss_mlp": 0.0, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.05990107828974712, + "language_loss": 0.84426653, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85473955, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4766, + "time_per_iteration": 2.907374620437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_mlp": 1.04099941, + "diversity_loss_mlp": 0.0, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.06352266446456978, + "language_loss": 0.8504523, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86095023, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4767, + "time_per_iteration": 2.526810884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_mlp": 1.04373765, + "diversity_loss_mlp": 0.0, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 0.07650045088890157, + "language_loss": 0.80317563, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81370461, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4768, + "time_per_iteration": 2.8045382499694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006939, + "balance_loss_mlp": 1.00245714, + "diversity_loss_mlp": 0.0, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.004694640504473852, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79187173, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4769, + "time_per_iteration": 5.044682264328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_mlp": 1.03626275, + "diversity_loss_mlp": 0.0, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06781997849214876, + "language_loss": 0.85250586, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86295819, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4770, + "time_per_iteration": 2.691663980484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_mlp": 1.04060245, + "diversity_loss_mlp": 0.0, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.06638212987757935, + "language_loss": 0.84090322, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85139954, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4771, + "time_per_iteration": 2.4912991523742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048451, + "balance_loss_mlp": 1.03953981, + "diversity_loss_mlp": 0.0, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.06646365406462024, + "language_loss": 0.80387986, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81436437, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4772, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051354, + "balance_loss_mlp": 1.04250824, + "diversity_loss_mlp": 0.0, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.06012915212224945, + "language_loss": 0.87101483, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88152838, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4773, + "time_per_iteration": 2.7718729972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_mlp": 1.04024041, + "diversity_loss_mlp": 0.0, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.05995829646518676, + "language_loss": 0.8283515, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83884239, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4774, + "time_per_iteration": 3.344503164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051218, + "balance_loss_mlp": 1.04242659, + "diversity_loss_mlp": 0.0, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.06073994859782487, + "language_loss": 0.84713805, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85765028, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4775, + "time_per_iteration": 2.641633987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.04000342, + "diversity_loss_mlp": 0.0, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.07386829063235183, + "language_loss": 0.79117858, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80166662, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4776, + "time_per_iteration": 3.311635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03558719, + "diversity_loss_mlp": 0.0, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.065220381744787, + "language_loss": 0.84085238, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85129607, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4777, + "time_per_iteration": 2.6673994064331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_mlp": 1.04089904, + "diversity_loss_mlp": 0.0, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.07320352446310975, + "language_loss": 0.79461032, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.8051064, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4778, + "time_per_iteration": 2.7164108753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_mlp": 1.03928864, + "diversity_loss_mlp": 0.0, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.06805017648291643, + "language_loss": 0.79739892, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.80787992, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4779, + "time_per_iteration": 3.1064183712005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006109, + "balance_loss_mlp": 1.00162721, + "diversity_loss_mlp": 0.0, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.003729713968603667, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80801499, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4780, + "time_per_iteration": 4.670097351074219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.03773558, + "diversity_loss_mlp": 0.0, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.07167718666233086, + "language_loss": 0.78371525, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79418308, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4781, + "time_per_iteration": 2.550713300704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047938, + "balance_loss_mlp": 1.03888965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 0.06622093872641387, + "language_loss": 0.84516716, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85564649, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4782, + "time_per_iteration": 3.2526657581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049444, + "balance_loss_mlp": 1.04045606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.06845257893605372, + "language_loss": 0.77780342, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78829783, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4783, + "time_per_iteration": 2.6809587478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_mlp": 1.03927171, + "diversity_loss_mlp": 0.0, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.06867364706040735, + "language_loss": 0.85155487, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86203671, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4784, + "time_per_iteration": 2.7345173358917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787021, + "balance_loss_mlp": 1.32680988, + "diversity_loss_mlp": 0.22533412, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.03407668721721812, + "language_loss": 0.82609832, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83396852, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094901, + "step": 4785, + "time_per_iteration": 2.685168504714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_mlp": 1.03591502, + "diversity_loss_mlp": 0.0, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.0666841111034061, + "language_loss": 0.77980995, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79025835, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4786, + "time_per_iteration": 3.038740873336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051934, + "balance_loss_mlp": 1.04323745, + "diversity_loss_mlp": 0.0, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.06529131061254304, + "language_loss": 0.78631401, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79683334, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4787, + "time_per_iteration": 2.65899658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_mlp": 1.03834498, + "diversity_loss_mlp": 0.0, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.0788130161570292, + "language_loss": 0.8252883, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83576053, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4788, + "time_per_iteration": 2.6822633743286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.04112673, + "diversity_loss_mlp": 0.0, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07410580856976316, + "language_loss": 0.82474685, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83524632, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4789, + "time_per_iteration": 2.9761576652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045718, + "balance_loss_mlp": 1.03685474, + "diversity_loss_mlp": 0.0, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.0657414722313636, + "language_loss": 0.76699907, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77745622, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4790, + "time_per_iteration": 2.523589849472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004074, + "balance_loss_mlp": 0.99959189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.003599452207974624, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78074342, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4791, + "time_per_iteration": 4.9881064891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052288, + "balance_loss_mlp": 1.04335308, + "diversity_loss_mlp": 0.0, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.06705071724600334, + "language_loss": 0.76482338, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77534628, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4792, + "time_per_iteration": 2.9655344486236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044613, + "balance_loss_mlp": 1.03580952, + "diversity_loss_mlp": 0.0, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.20959696332428077, + "language_loss": 0.80366439, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81411052, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4793, + "time_per_iteration": 2.554954767227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044371, + "balance_loss_mlp": 1.0355078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.07075391253683742, + "language_loss": 0.84841311, + "learning_rate": 1.575804349061616e-05, + "loss": 0.8588568, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4794, + "time_per_iteration": 2.5949018001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_mlp": 1.0387888, + "diversity_loss_mlp": 0.0, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.0784160138888604, + "language_loss": 0.79135698, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80183321, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4795, + "time_per_iteration": 2.598656415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_mlp": 1.03969288, + "diversity_loss_mlp": 0.0, + "epoch": 0.9226625625240477, + "flos": 874640623104.0, + "grad_norm": 0.06249472558878416, + "language_loss": 0.75247115, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76295394, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 4796, + "time_per_iteration": 3.1448936462402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050009, + "balance_loss_mlp": 1.04117608, + "diversity_loss_mlp": 0.0, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.07031980659654383, + "language_loss": 0.88239074, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89289081, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4797, + "time_per_iteration": 2.543046474456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783825, + "balance_loss_mlp": 1.32076359, + "diversity_loss_mlp": 0.2258461, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.030753006157988122, + "language_loss": 0.84967744, + "learning_rate": 1.544915681564829e-05, + "loss": 0.85751569, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01051996, + "step": 4798, + "time_per_iteration": 2.819098949432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_mlp": 1.04029381, + "diversity_loss_mlp": 0.0, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.06926441515905145, + "language_loss": 0.79267633, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80316746, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4799, + "time_per_iteration": 3.0866541862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048573, + "balance_loss_mlp": 1.03970361, + "diversity_loss_mlp": 0.0, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.06842232748476472, + "language_loss": 0.84939086, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85987657, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4800, + "time_per_iteration": 2.840742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048436, + "balance_loss_mlp": 1.03941798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.07816499010690336, + "language_loss": 0.76574665, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77623105, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4801, + "time_per_iteration": 2.8335320949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_mlp": 1.04159379, + "diversity_loss_mlp": 0.0, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.06210245880406286, + "language_loss": 0.843297, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85380167, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4802, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_mlp": 1.03858757, + "diversity_loss_mlp": 0.0, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.09058835826894181, + "language_loss": 0.81587046, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82634509, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4803, + "time_per_iteration": 2.6244726181030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045538, + "balance_loss_mlp": 1.0367403, + "diversity_loss_mlp": 0.0, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.06939366274556823, + "language_loss": 0.73765552, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74811089, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4804, + "time_per_iteration": 2.8761777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050005, + "balance_loss_mlp": 1.04128492, + "diversity_loss_mlp": 0.0, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.07337139477875909, + "language_loss": 0.79396987, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80446994, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4805, + "time_per_iteration": 2.9650769233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046276, + "balance_loss_mlp": 1.03742468, + "diversity_loss_mlp": 0.0, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.07187105546875673, + "language_loss": 0.90605378, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91651654, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4806, + "time_per_iteration": 2.6060450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788148, + "balance_loss_mlp": 1.33026791, + "diversity_loss_mlp": 0.22471815, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.03522090358058462, + "language_loss": 0.77311206, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78099358, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01065494, + "step": 4807, + "time_per_iteration": 2.9656925201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047033, + "balance_loss_mlp": 1.03775859, + "diversity_loss_mlp": 0.0, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.05970940147953983, + "language_loss": 0.85029161, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.860762, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4808, + "time_per_iteration": 2.730725049972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_mlp": 1.0360359, + "diversity_loss_mlp": 0.0, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.07434097229920794, + "language_loss": 0.85175872, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86221194, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 4809, + "time_per_iteration": 2.677678346633911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047523, + "balance_loss_mlp": 1.03848672, + "diversity_loss_mlp": 0.0, + "epoch": 0.9253559061177375, + "flos": 611280608256.0, + "grad_norm": 0.06773039177733224, + "language_loss": 0.79278344, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80325866, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4810, + "time_per_iteration": 2.7994203567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003108, + "balance_loss_mlp": 0.99864995, + "diversity_loss_mlp": 0.0, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.003310724835280569, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77928501, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4811, + "time_per_iteration": 4.716477394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053397, + "balance_loss_mlp": 1.04443264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.07798685492020957, + "language_loss": 0.80983555, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82036948, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4812, + "time_per_iteration": 3.111435651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.03743255, + "diversity_loss_mlp": 0.0, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.07891038810151499, + "language_loss": 0.83191156, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84237611, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4813, + "time_per_iteration": 2.5696511268615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_mlp": 1.04083896, + "diversity_loss_mlp": 0.0, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.06938826271777476, + "language_loss": 0.79197675, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80247152, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4814, + "time_per_iteration": 2.716487407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_mlp": 1.03926587, + "diversity_loss_mlp": 0.0, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.06659923130000121, + "language_loss": 0.8535648, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86404449, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4815, + "time_per_iteration": 2.7244887351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.0416646, + "diversity_loss_mlp": 0.0, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.06890226138960381, + "language_loss": 0.83825701, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84876096, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4816, + "time_per_iteration": 2.464979887008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048928, + "balance_loss_mlp": 1.04029167, + "diversity_loss_mlp": 0.0, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.07919281923401009, + "language_loss": 0.84257257, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85306185, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4817, + "time_per_iteration": 2.640580415725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.03899682, + "diversity_loss_mlp": 0.0, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06905431252215245, + "language_loss": 0.82030249, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83077914, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.08679199, + "routerloss_mlp": 0.0, + "step": 4818, + "time_per_iteration": 2.6683123111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_mlp": 1.04013073, + "diversity_loss_mlp": 0.0, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.06364347314694363, + "language_loss": 0.82941604, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.8399058, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4819, + "time_per_iteration": 2.622507333755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_mlp": 1.03880203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.07369631813155064, + "language_loss": 0.8604511, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87092614, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4820, + "time_per_iteration": 2.5886447429656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_mlp": 1.03391302, + "diversity_loss_mlp": 0.0, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.06986061953541225, + "language_loss": 0.78981894, + "learning_rate": 1.373152729763938e-05, + "loss": 0.80024862, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4821, + "time_per_iteration": 3.002431869506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100315, + "balance_loss_mlp": 0.99869162, + "diversity_loss_mlp": 0.0, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.0033138689547235365, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83383614, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4822, + "time_per_iteration": 4.872236728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_mlp": 1.03961504, + "diversity_loss_mlp": 0.0, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.10753003885480804, + "language_loss": 0.80162168, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81210387, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4823, + "time_per_iteration": 3.065385103225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_mlp": 1.03920078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9280492497114274, + "flos": 412223883264.0, + "grad_norm": 0.07544984559040653, + "language_loss": 0.74334532, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75382626, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4824, + "time_per_iteration": 2.459182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045084, + "balance_loss_mlp": 1.03613138, + "diversity_loss_mlp": 0.0, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.1022591189326798, + "language_loss": 0.84062541, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85107625, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4825, + "time_per_iteration": 2.7902333736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.03844011, + "diversity_loss_mlp": 0.0, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06332347540086566, + "language_loss": 0.80870605, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81918162, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4826, + "time_per_iteration": 3.014462947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078881, + "balance_loss_mlp": 1.33157349, + "diversity_loss_mlp": 0.22439189, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.028742947039502215, + "language_loss": 0.83905512, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84694326, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01082761, + "step": 4827, + "time_per_iteration": 3.0634989738464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_mlp": 1.03804338, + "diversity_loss_mlp": 0.0, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.07468304915568001, + "language_loss": 0.80064201, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81110942, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4828, + "time_per_iteration": 2.9195716381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104862, + "balance_loss_mlp": 1.03953636, + "diversity_loss_mlp": 0.0, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.06920378526179259, + "language_loss": 0.83656001, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84704626, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4829, + "time_per_iteration": 2.5818231105804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100325, + "balance_loss_mlp": 0.99879169, + "diversity_loss_mlp": 0.0, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.0032198614954978416, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73125315, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4830, + "time_per_iteration": 4.951828718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003246, + "balance_loss_mlp": 0.99878782, + "diversity_loss_mlp": 0.0, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.003220380799395436, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80515087, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4831, + "time_per_iteration": 4.905702590942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_mlp": 1.04304385, + "diversity_loss_mlp": 0.0, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.08579455116544467, + "language_loss": 0.84383392, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85435468, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4832, + "time_per_iteration": 2.6667189598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051534, + "balance_loss_mlp": 1.04263496, + "diversity_loss_mlp": 0.0, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.07653793753230506, + "language_loss": 0.80192435, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81243968, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4833, + "time_per_iteration": 2.530576705932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784425, + "balance_loss_mlp": 1.32099247, + "diversity_loss_mlp": 0.22666103, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.02823635345590092, + "language_loss": 0.80189478, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.80973905, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01059832, + "step": 4834, + "time_per_iteration": 2.8291900157928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_mlp": 1.03810263, + "diversity_loss_mlp": 0.0, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.06755490191164544, + "language_loss": 0.82792151, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83838797, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.08551025, + "routerloss_mlp": 0.0, + "step": 4835, + "time_per_iteration": 2.823146104812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_mlp": 0.99874461, + "diversity_loss_mlp": 0.0, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.0032138775564420387, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77855623, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4836, + "time_per_iteration": 4.9668896198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_mlp": 1.04090357, + "diversity_loss_mlp": 0.0, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.07503406646188047, + "language_loss": 0.82993883, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84043521, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4837, + "time_per_iteration": 2.637373924255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045053, + "balance_loss_mlp": 1.03608274, + "diversity_loss_mlp": 0.0, + "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.08374095917705242, + "language_loss": 0.81554383, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82599437, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4838, + "time_per_iteration": 2.515183448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784124, + "balance_loss_mlp": 1.32047153, + "diversity_loss_mlp": 0.22594652, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.03184031575728359, + "language_loss": 0.86872089, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87656212, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01091547, + "step": 4839, + "time_per_iteration": 2.7616403102874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_mlp": 1.03896284, + "diversity_loss_mlp": 0.0, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.061490618450412385, + "language_loss": 0.76999998, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.78047729, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4840, + "time_per_iteration": 3.0911495685577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_mlp": 1.04037976, + "diversity_loss_mlp": 0.0, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.07195558921256455, + "language_loss": 0.82423127, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83472311, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4841, + "time_per_iteration": 2.6239800453186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_mlp": 1.0373354, + "diversity_loss_mlp": 0.0, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07717139537202818, + "language_loss": 0.81388533, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82434833, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4842, + "time_per_iteration": 2.5533297061920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104779, + "balance_loss_mlp": 1.03906965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.07073553761883396, + "language_loss": 0.77673644, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78721428, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4843, + "time_per_iteration": 2.528705358505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046518, + "balance_loss_mlp": 1.03777993, + "diversity_loss_mlp": 0.0, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.06887316839423005, + "language_loss": 0.7711761, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78164124, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4844, + "time_per_iteration": 2.7841336727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.0387938, + "diversity_loss_mlp": 0.0, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.07471339735643584, + "language_loss": 0.80957037, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.82004702, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4845, + "time_per_iteration": 2.5967259407043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047686, + "balance_loss_mlp": 1.03901899, + "diversity_loss_mlp": 0.0, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.0577436249864269, + "language_loss": 0.80821908, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.8186959, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4846, + "time_per_iteration": 2.735156774520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780467, + "balance_loss_mlp": 1.31358051, + "diversity_loss_mlp": 0.22594975, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.03394753668394222, + "language_loss": 0.82436854, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83217323, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01070218, + "step": 4847, + "time_per_iteration": 2.730957269668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.03871894, + "diversity_loss_mlp": 0.0, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.06722310118133415, + "language_loss": 0.82897216, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83944857, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4848, + "time_per_iteration": 3.0189881324768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_mlp": 1.03535616, + "diversity_loss_mlp": 0.0, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.08276324861402574, + "language_loss": 0.78624225, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79668438, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4849, + "time_per_iteration": 3.2938950061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.03470397, + "diversity_loss_mlp": 0.0, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.07019081687121781, + "language_loss": 0.80391824, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81435609, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4850, + "time_per_iteration": 2.7515499591827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047473, + "balance_loss_mlp": 1.03849709, + "diversity_loss_mlp": 0.0, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.06820253841014733, + "language_loss": 0.85668182, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86715662, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4851, + "time_per_iteration": 2.680340528488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047775, + "balance_loss_mlp": 1.03895366, + "diversity_loss_mlp": 0.0, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.1196628498062152, + "language_loss": 0.8199991, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83047688, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4852, + "time_per_iteration": 2.5966830253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045996, + "balance_loss_mlp": 1.03706086, + "diversity_loss_mlp": 0.0, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.07419739239105261, + "language_loss": 0.82826304, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.838723, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4853, + "time_per_iteration": 2.7714791297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100356, + "balance_loss_mlp": 0.999125, + "diversity_loss_mlp": 0.0, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.004307105036144614, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.7945857, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4854, + "time_per_iteration": 4.893805265426636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046802, + "balance_loss_mlp": 1.03777242, + "diversity_loss_mlp": 0.0, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.06988266936343929, + "language_loss": 0.81466687, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82513487, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4855, + "time_per_iteration": 2.9019949436187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046813, + "balance_loss_mlp": 1.03802776, + "diversity_loss_mlp": 0.0, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.06582390304127933, + "language_loss": 0.76894152, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77940965, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4856, + "time_per_iteration": 2.650545597076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049411, + "balance_loss_mlp": 1.04064322, + "diversity_loss_mlp": 0.0, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.0537499767930613, + "language_loss": 0.84482789, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.855322, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4857, + "time_per_iteration": 2.82725191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044882, + "balance_loss_mlp": 1.03607237, + "diversity_loss_mlp": 0.0, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.0729144221953202, + "language_loss": 0.80315518, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.813604, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4858, + "time_per_iteration": 2.575934410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00589881, + "balance_loss_mlp": 1.02917051, + "diversity_loss_mlp": 0.13201948, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.001270733186004784, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76577604, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928602, + "step": 4859, + "time_per_iteration": 4.699007987976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043954, + "balance_loss_mlp": 1.03486431, + "diversity_loss_mlp": 0.0, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.05691745976231031, + "language_loss": 0.81198478, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82242435, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4860, + "time_per_iteration": 2.777012825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_mlp": 1.0347048, + "diversity_loss_mlp": 0.0, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.10010618557822787, + "language_loss": 0.79151934, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.80195618, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4861, + "time_per_iteration": 2.6320016384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044691, + "balance_loss_mlp": 1.03600073, + "diversity_loss_mlp": 0.0, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.08362946312424821, + "language_loss": 0.86286509, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87331206, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4862, + "time_per_iteration": 2.6105833053588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_mlp": 1.03586817, + "diversity_loss_mlp": 0.0, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.05900589694062164, + "language_loss": 0.84758484, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85803056, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4863, + "time_per_iteration": 2.7426371574401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045058, + "balance_loss_mlp": 1.03628969, + "diversity_loss_mlp": 0.0, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.0932071553441471, + "language_loss": 0.78725177, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79770231, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4864, + "time_per_iteration": 2.5507235527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_mlp": 1.0366559, + "diversity_loss_mlp": 0.0, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.08522352532070332, + "language_loss": 0.76506388, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77551782, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4865, + "time_per_iteration": 4.053428649902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045737, + "balance_loss_mlp": 1.03688622, + "diversity_loss_mlp": 0.0, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.06592512300053538, + "language_loss": 0.85022455, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86068201, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4866, + "time_per_iteration": 2.608532667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.03633118, + "diversity_loss_mlp": 0.0, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.08990017203823457, + "language_loss": 0.84334439, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85379487, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4867, + "time_per_iteration": 2.7455151081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003805, + "balance_loss_mlp": 0.99937075, + "diversity_loss_mlp": 0.0, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.005040674101907188, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80207145, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4868, + "time_per_iteration": 4.876135587692261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_mlp": 1.03491819, + "diversity_loss_mlp": 0.0, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.07053266752313711, + "language_loss": 0.81646079, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82689929, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4869, + "time_per_iteration": 2.708939790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046683, + "balance_loss_mlp": 1.03781986, + "diversity_loss_mlp": 0.0, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.060976282943095796, + "language_loss": 0.82172537, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83219218, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4870, + "time_per_iteration": 2.65925669670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.03607297, + "diversity_loss_mlp": 0.0, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.062164083958686674, + "language_loss": 0.78947985, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79992884, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4871, + "time_per_iteration": 2.9242799282073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040518, + "balance_loss_mlp": 1.03191113, + "diversity_loss_mlp": 0.0, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.06036408822352837, + "language_loss": 0.78672194, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79712713, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4872, + "time_per_iteration": 2.7019548416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044427, + "balance_loss_mlp": 1.03556955, + "diversity_loss_mlp": 0.0, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.0656254889693481, + "language_loss": 0.81680477, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82724905, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4873, + "time_per_iteration": 2.6661787033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_mlp": 1.04079902, + "diversity_loss_mlp": 0.0, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.07062356836033176, + "language_loss": 0.82414687, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83464432, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4874, + "time_per_iteration": 2.711991310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047638, + "balance_loss_mlp": 1.03853059, + "diversity_loss_mlp": 0.0, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 0.07310284560827243, + "language_loss": 0.80373824, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81421459, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4875, + "time_per_iteration": 2.650681972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_mlp": 1.03720248, + "diversity_loss_mlp": 0.0, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.062293314386374414, + "language_loss": 0.77575111, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78621429, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4876, + "time_per_iteration": 2.6609630584716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.03741789, + "diversity_loss_mlp": 0.0, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06315414550541629, + "language_loss": 0.84719789, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85766232, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4877, + "time_per_iteration": 2.651707172393799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.03690422, + "diversity_loss_mlp": 0.0, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.0631685338403412, + "language_loss": 0.81854308, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82899821, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4878, + "time_per_iteration": 2.949418783187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.04068828, + "diversity_loss_mlp": 0.0, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.08565110846317762, + "language_loss": 0.80980134, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82029772, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4879, + "time_per_iteration": 2.6912460327148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048881, + "balance_loss_mlp": 1.03982735, + "diversity_loss_mlp": 0.0, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.0804374374941349, + "language_loss": 0.79621142, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80670023, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4880, + "time_per_iteration": 2.6459033489227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045381, + "balance_loss_mlp": 1.03668451, + "diversity_loss_mlp": 0.0, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.07482420746454603, + "language_loss": 0.80257022, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81302404, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4881, + "time_per_iteration": 2.5740063190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_mlp": 1.03760171, + "diversity_loss_mlp": 0.0, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.0852712224295467, + "language_loss": 0.76510745, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77557057, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4882, + "time_per_iteration": 3.689307689666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050073, + "balance_loss_mlp": 1.04114449, + "diversity_loss_mlp": 0.0, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.09232552056587429, + "language_loss": 0.7808578, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79135859, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4883, + "time_per_iteration": 2.7796614170074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004059, + "balance_loss_mlp": 0.99962485, + "diversity_loss_mlp": 0.0, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.004454986396057403, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79174733, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4884, + "time_per_iteration": 4.815665245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049205, + "balance_loss_mlp": 1.04029393, + "diversity_loss_mlp": 0.0, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.06863865940742271, + "language_loss": 0.78660077, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79709285, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4885, + "time_per_iteration": 2.6168384552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104537, + "balance_loss_mlp": 1.03678083, + "diversity_loss_mlp": 0.0, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.07504646640886149, + "language_loss": 0.83853602, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84898973, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 4886, + "time_per_iteration": 2.5939505100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00589544, + "balance_loss_mlp": 1.02856016, + "diversity_loss_mlp": 0.13189431, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.0012775322428382279, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.79921734, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00931658, + "step": 4887, + "time_per_iteration": 4.869856357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047496, + "balance_loss_mlp": 1.03856742, + "diversity_loss_mlp": 0.0, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.06148309655419226, + "language_loss": 0.85074973, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86122465, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4888, + "time_per_iteration": 2.723494052886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004242, + "balance_loss_mlp": 0.99983096, + "diversity_loss_mlp": 0.0, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + "grad_norm": 0.0026510918871896585, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76173675, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4889, + "time_per_iteration": 4.887513637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_mlp": 1.03848016, + "diversity_loss_mlp": 0.0, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.07795508435687046, + "language_loss": 0.83106863, + "learning_rate": 9.179144190235799e-06, + "loss": 0.8415432, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4890, + "time_per_iteration": 2.6607882976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_mlp": 1.03781509, + "diversity_loss_mlp": 0.0, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06087500740988416, + "language_loss": 0.76773834, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77820671, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4891, + "time_per_iteration": 2.704505205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004256, + "balance_loss_mlp": 0.99982125, + "diversity_loss_mlp": 0.0, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.0026524442975608157, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81246138, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4892, + "time_per_iteration": 4.861233949661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_mlp": 1.04099298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.0781928260181619, + "language_loss": 0.78609961, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79659593, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4893, + "time_per_iteration": 2.7279999256134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048837, + "balance_loss_mlp": 1.03969967, + "diversity_loss_mlp": 0.0, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.06974865955281616, + "language_loss": 0.80413878, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81462717, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4894, + "time_per_iteration": 3.0058786869049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050294, + "balance_loss_mlp": 1.04135358, + "diversity_loss_mlp": 0.0, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.08932063460271895, + "language_loss": 0.79991817, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81042111, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4895, + "time_per_iteration": 3.1561882495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_mlp": 1.03778601, + "diversity_loss_mlp": 0.0, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.0641512346414091, + "language_loss": 0.85852486, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86898911, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4896, + "time_per_iteration": 2.6913957595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_mlp": 1.03639615, + "diversity_loss_mlp": 0.0, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.0665448744143015, + "language_loss": 0.80267036, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81312299, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4897, + "time_per_iteration": 2.7335498332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_mlp": 1.0395788, + "diversity_loss_mlp": 0.0, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.07266036540005272, + "language_loss": 0.86784083, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87832236, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 4898, + "time_per_iteration": 2.820343255996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.04090655, + "diversity_loss_mlp": 0.0, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.07366201746067845, + "language_loss": 0.84326768, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85376465, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4899, + "time_per_iteration": 2.708554744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_mlp": 1.03626442, + "diversity_loss_mlp": 0.0, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.07321817964783915, + "language_loss": 0.79721165, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80766267, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4900, + "time_per_iteration": 2.674393892288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054375, + "balance_loss_mlp": 1.04557145, + "diversity_loss_mlp": 0.0, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.0749978632070715, + "language_loss": 0.78455758, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79510128, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4901, + "time_per_iteration": 2.805161952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047513, + "balance_loss_mlp": 1.03873909, + "diversity_loss_mlp": 0.0, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.07047076389805079, + "language_loss": 0.82071722, + "learning_rate": 8.479809201123178e-06, + "loss": 0.83119237, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4902, + "time_per_iteration": 2.732999324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.03907359, + "diversity_loss_mlp": 0.0, + "epoch": 0.943247402847249, + "flos": 565990571520.0, + "grad_norm": 0.06786486493908951, + "language_loss": 0.78043211, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79091066, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4903, + "time_per_iteration": 2.7100279331207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048262, + "balance_loss_mlp": 1.03943491, + "diversity_loss_mlp": 0.0, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.07474785644916408, + "language_loss": 0.81409293, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82457554, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4904, + "time_per_iteration": 2.598313093185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046884, + "balance_loss_mlp": 1.0381397, + "diversity_loss_mlp": 0.0, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.06861839019347821, + "language_loss": 0.82857311, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83904195, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4905, + "time_per_iteration": 2.7130138874053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010495, + "balance_loss_mlp": 1.04049969, + "diversity_loss_mlp": 0.0, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.05740754157545699, + "language_loss": 0.85487771, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86537278, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4906, + "time_per_iteration": 2.819689989089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_mlp": 1.03723109, + "diversity_loss_mlp": 0.0, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.0749683755111213, + "language_loss": 0.81567025, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82613063, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4907, + "time_per_iteration": 2.554344415664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.04098761, + "diversity_loss_mlp": 0.0, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.06901073906266146, + "language_loss": 0.73883832, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74933642, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4908, + "time_per_iteration": 3.0110507011413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_mlp": 1.03897214, + "diversity_loss_mlp": 0.0, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.07411833720689497, + "language_loss": 0.8239246, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83440387, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4909, + "time_per_iteration": 2.6770436763763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047051, + "balance_loss_mlp": 1.03801453, + "diversity_loss_mlp": 0.0, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06788128134122538, + "language_loss": 0.86283165, + "learning_rate": 8.028849459169318e-06, + "loss": 0.8733021, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4910, + "time_per_iteration": 2.582549810409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_mlp": 1.04030466, + "diversity_loss_mlp": 0.0, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.0678450295570026, + "language_loss": 0.80976182, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82025248, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4911, + "time_per_iteration": 2.8425984382629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049899, + "balance_loss_mlp": 1.04112482, + "diversity_loss_mlp": 0.0, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.08525541673585063, + "language_loss": 0.81182563, + "learning_rate": 7.918019090162098e-06, + "loss": 0.82232463, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4912, + "time_per_iteration": 2.7192227840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004436, + "balance_loss_mlp": 1.00002539, + "diversity_loss_mlp": 0.0, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.00558203174928547, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79291773, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4913, + "time_per_iteration": 4.945667505264282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_mlp": 1.0412302, + "diversity_loss_mlp": 0.0, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.07323836789774518, + "language_loss": 0.90345061, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91395086, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4914, + "time_per_iteration": 2.628188371658325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004442, + "balance_loss_mlp": 1.00000703, + "diversity_loss_mlp": 0.0, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.00558152160329536, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.8456679, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4915, + "time_per_iteration": 4.940939426422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_mlp": 1.04091215, + "diversity_loss_mlp": 0.0, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.05816068289189103, + "language_loss": 0.81779099, + "learning_rate": 7.698651040865534e-06, + "loss": 0.8282882, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4916, + "time_per_iteration": 2.622225522994995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_mlp": 1.03712368, + "diversity_loss_mlp": 0.0, + "epoch": 0.9459407464409388, + "flos": 1019405979648.0, + "grad_norm": 0.06122686842867312, + "language_loss": 0.82315564, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83361328, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.08654785, + "routerloss_mlp": 0.0, + "step": 4917, + "time_per_iteration": 3.3565821647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050444, + "balance_loss_mlp": 1.04189634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.07064430272408662, + "language_loss": 0.81672692, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82723141, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.08557129, + "routerloss_mlp": 0.0, + "step": 4918, + "time_per_iteration": 2.609248399734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049398, + "balance_loss_mlp": 1.04064822, + "diversity_loss_mlp": 0.0, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.07970710282703287, + "language_loss": 0.7821058, + "learning_rate": 7.536131776620936e-06, + "loss": 0.7925998, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4919, + "time_per_iteration": 2.6066248416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_mlp": 1.0406847, + "diversity_loss_mlp": 0.0, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.08687319482199532, + "language_loss": 0.83590424, + "learning_rate": 7.482341043430485e-06, + "loss": 0.8463999, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4920, + "time_per_iteration": 2.579651117324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_mlp": 1.03711653, + "diversity_loss_mlp": 0.0, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.06849366756552606, + "language_loss": 0.85644251, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86690247, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4921, + "time_per_iteration": 2.9116263389587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045153, + "balance_loss_mlp": 1.03621817, + "diversity_loss_mlp": 0.0, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06484399276768851, + "language_loss": 0.89472318, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90517473, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4922, + "time_per_iteration": 2.9387049674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_mlp": 1.03844738, + "diversity_loss_mlp": 0.0, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.08622456288461161, + "language_loss": 0.80096912, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81144309, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4923, + "time_per_iteration": 2.6302578449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050313, + "balance_loss_mlp": 1.04158056, + "diversity_loss_mlp": 0.0, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.0601694962527871, + "language_loss": 0.81003237, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82053542, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4924, + "time_per_iteration": 2.808788299560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051085, + "balance_loss_mlp": 1.04240632, + "diversity_loss_mlp": 0.0, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.06384621728093878, + "language_loss": 0.80346602, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81397688, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4925, + "time_per_iteration": 2.6172335147857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_mlp": 1.04039288, + "diversity_loss_mlp": 0.0, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.06326857300487894, + "language_loss": 0.85833907, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86883175, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4926, + "time_per_iteration": 3.1013805866241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046961, + "balance_loss_mlp": 1.03822935, + "diversity_loss_mlp": 0.0, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.0714765450100148, + "language_loss": 0.7945109, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80498052, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4927, + "time_per_iteration": 2.7759459018707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044976, + "balance_loss_mlp": 1.03620195, + "diversity_loss_mlp": 0.0, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.08515861260909238, + "language_loss": 0.75973248, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77018219, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4928, + "time_per_iteration": 2.8861470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052944, + "balance_loss_mlp": 1.04416978, + "diversity_loss_mlp": 0.0, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.06735199813953592, + "language_loss": 0.83267879, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84320819, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4929, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_mlp": 1.03666258, + "diversity_loss_mlp": 0.0, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.06926665939050473, + "language_loss": 0.78147107, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79192489, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4930, + "time_per_iteration": 2.7705516815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784775, + "balance_loss_mlp": 1.32251549, + "diversity_loss_mlp": 0.22577199, + "epoch": 0.9486340900346287, + "flos": 538598937600.0, + "grad_norm": 0.030705907107943475, + "language_loss": 0.80018926, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80803692, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01063121, + "step": 4931, + "time_per_iteration": 2.700617551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.04359388, + "diversity_loss_mlp": 0.0, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.07163166168335688, + "language_loss": 0.85786635, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86839288, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4932, + "time_per_iteration": 2.8230526447296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.03682232, + "diversity_loss_mlp": 0.0, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.07113425512473334, + "language_loss": 0.88082981, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.89128458, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.08660889, + "routerloss_mlp": 0.0, + "step": 4933, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052399, + "balance_loss_mlp": 1.04369068, + "diversity_loss_mlp": 0.0, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.06957529053478449, + "language_loss": 0.82772219, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83824623, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4934, + "time_per_iteration": 2.682114362716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_mlp": 1.03765225, + "diversity_loss_mlp": 0.0, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.06306988880080433, + "language_loss": 0.8386761, + "learning_rate": 6.698437041126992e-06, + "loss": 0.84913647, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.0838623, + "routerloss_mlp": 0.0, + "step": 4935, + "time_per_iteration": 2.726893424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046934, + "balance_loss_mlp": 1.03838086, + "diversity_loss_mlp": 0.0, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.05973475098726946, + "language_loss": 0.82893109, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83940041, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.08563232, + "routerloss_mlp": 0.0, + "step": 4936, + "time_per_iteration": 2.729111671447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.03814435, + "diversity_loss_mlp": 0.0, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.07659756248200288, + "language_loss": 0.82697654, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83744407, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4937, + "time_per_iteration": 2.8081254959106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784639, + "balance_loss_mlp": 1.32296765, + "diversity_loss_mlp": 0.22491853, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.031155014429691368, + "language_loss": 0.86999297, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87783933, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01069584, + "step": 4938, + "time_per_iteration": 2.647392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.04043269, + "diversity_loss_mlp": 0.0, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.06549207812906088, + "language_loss": 0.82709306, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83758503, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4939, + "time_per_iteration": 2.6947948932647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_mlp": 1.04041934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.0674263053300071, + "language_loss": 0.80045903, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81095159, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4940, + "time_per_iteration": 2.537261486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.03906798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.06671960471522939, + "language_loss": 0.84743893, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85791707, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4941, + "time_per_iteration": 2.7824418544769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051988, + "balance_loss_mlp": 1.04320264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.07518292778028754, + "language_loss": 0.81734824, + "learning_rate": 6.347357823816235e-06, + "loss": 0.8278681, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4942, + "time_per_iteration": 2.5175111293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_mlp": 1.03662586, + "diversity_loss_mlp": 0.0, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.06073583327995898, + "language_loss": 0.79565704, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80611289, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4943, + "time_per_iteration": 2.98564076423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_mlp": 1.03589809, + "diversity_loss_mlp": 0.0, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.07464458367850044, + "language_loss": 0.82931554, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83976078, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 4944, + "time_per_iteration": 2.586824417114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.03944492, + "diversity_loss_mlp": 0.0, + "epoch": 0.9513274336283186, + "flos": 614621094912.0, + "grad_norm": 0.0706686343064775, + "language_loss": 0.81845355, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82893419, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4945, + "time_per_iteration": 2.921309232711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_mlp": 1.03738809, + "diversity_loss_mlp": 0.0, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.07524726970917751, + "language_loss": 0.82137179, + "learning_rate": 6.150957065611363e-06, + "loss": 0.83183479, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4946, + "time_per_iteration": 2.5640242099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.04034781, + "diversity_loss_mlp": 0.0, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.07065066286266242, + "language_loss": 0.76635486, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77684867, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4947, + "time_per_iteration": 2.965193033218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049032, + "balance_loss_mlp": 1.04028833, + "diversity_loss_mlp": 0.0, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.06944081610529035, + "language_loss": 0.75779366, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76828402, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4948, + "time_per_iteration": 2.8114254474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_mlp": 1.03859949, + "diversity_loss_mlp": 0.0, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.06267886834412634, + "language_loss": 0.80306596, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81354034, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4949, + "time_per_iteration": 2.829516887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_mlp": 1.03901839, + "diversity_loss_mlp": 0.0, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.06460536676220141, + "language_loss": 0.83404064, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84451616, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.08538818, + "routerloss_mlp": 0.0, + "step": 4950, + "time_per_iteration": 3.064345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_mlp": 1.03875649, + "diversity_loss_mlp": 0.0, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.07065514061093704, + "language_loss": 0.80931592, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81979299, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4951, + "time_per_iteration": 2.9210174083709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_mlp": 1.03724885, + "diversity_loss_mlp": 0.0, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.0779800356462361, + "language_loss": 0.82006431, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83052403, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4952, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_mlp": 1.0397898, + "diversity_loss_mlp": 0.0, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.07317068745782636, + "language_loss": 0.81126779, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82175738, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4953, + "time_per_iteration": 2.593344211578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_mlp": 1.03926563, + "diversity_loss_mlp": 0.0, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.06495580169291973, + "language_loss": 0.85402286, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86450183, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4954, + "time_per_iteration": 2.757946491241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780626, + "balance_loss_mlp": 1.31376719, + "diversity_loss_mlp": 0.22618601, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.03586731087797504, + "language_loss": 0.81228065, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82008696, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0106497, + "step": 4955, + "time_per_iteration": 2.883862018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_mlp": 1.04027104, + "diversity_loss_mlp": 0.0, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.07193968737801358, + "language_loss": 0.84132719, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85182136, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4956, + "time_per_iteration": 2.5883569717407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_mlp": 1.03826427, + "diversity_loss_mlp": 0.0, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.06822952225428794, + "language_loss": 0.81915605, + "learning_rate": 5.626676233493167e-06, + "loss": 0.82962549, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4957, + "time_per_iteration": 2.630600690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.040012, + "diversity_loss_mlp": 0.0, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.05995693166435021, + "language_loss": 0.83973289, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85022032, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4958, + "time_per_iteration": 3.0566930770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045737, + "balance_loss_mlp": 1.0366534, + "diversity_loss_mlp": 0.0, + "epoch": 0.9540207772220085, + "flos": 556668039168.0, + "grad_norm": 0.06699001332746012, + "language_loss": 0.80331284, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81377017, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4959, + "time_per_iteration": 2.761378049850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.03821445, + "diversity_loss_mlp": 0.0, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.0761611393687458, + "language_loss": 0.82048774, + "learning_rate": 5.487720113876882e-06, + "loss": 0.83095926, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4960, + "time_per_iteration": 2.932245969772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_mlp": 1.04009259, + "diversity_loss_mlp": 0.0, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.06840338993330367, + "language_loss": 0.8257823, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83627176, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4961, + "time_per_iteration": 2.7189135551452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049536, + "balance_loss_mlp": 1.04058886, + "diversity_loss_mlp": 0.0, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06804248679935226, + "language_loss": 0.80613565, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81663102, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4962, + "time_per_iteration": 3.102736711502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078831, + "balance_loss_mlp": 1.33004642, + "diversity_loss_mlp": 0.2248106, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.03404897095721445, + "language_loss": 0.77822566, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78610873, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01088165, + "step": 4963, + "time_per_iteration": 3.1009397506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051718, + "balance_loss_mlp": 1.04287314, + "diversity_loss_mlp": 0.0, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.0785854138679803, + "language_loss": 0.82759595, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83811319, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4964, + "time_per_iteration": 2.5947694778442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_mlp": 1.04327834, + "diversity_loss_mlp": 0.0, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.06792534083569658, + "language_loss": 0.82819939, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83872032, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4965, + "time_per_iteration": 2.803609609603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050399, + "balance_loss_mlp": 1.04159546, + "diversity_loss_mlp": 0.0, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.06616240585597659, + "language_loss": 0.8283782, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83888221, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4966, + "time_per_iteration": 2.584310531616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048058, + "balance_loss_mlp": 1.03910518, + "diversity_loss_mlp": 0.0, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.07793598675668457, + "language_loss": 0.8188796, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82936013, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4967, + "time_per_iteration": 2.592332601547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.03739893, + "diversity_loss_mlp": 0.0, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.06516874865343447, + "language_loss": 0.84235787, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85282034, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4968, + "time_per_iteration": 2.6265814304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_mlp": 1.03580022, + "diversity_loss_mlp": 0.0, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05920920196225761, + "language_loss": 0.81924808, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82969612, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4969, + "time_per_iteration": 2.668456554412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_mlp": 1.03809857, + "diversity_loss_mlp": 0.0, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.07042790663947672, + "language_loss": 0.79412282, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80459142, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4970, + "time_per_iteration": 2.7282607555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050652, + "balance_loss_mlp": 1.0418489, + "diversity_loss_mlp": 0.0, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.06399713345893698, + "language_loss": 0.80029887, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81080544, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4971, + "time_per_iteration": 2.6104438304901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588737, + "balance_loss_mlp": 1.02730632, + "diversity_loss_mlp": 0.13157621, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.0012650050689020306, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.823623, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0092962, + "step": 4972, + "time_per_iteration": 4.941720008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044467, + "balance_loss_mlp": 1.03565741, + "diversity_loss_mlp": 0.0, + "epoch": 0.9567141208156984, + "flos": 503846853120.0, + "grad_norm": 0.059256065258913096, + "language_loss": 0.78335071, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79379541, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4973, + "time_per_iteration": 2.788773775100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_mlp": 1.04036331, + "diversity_loss_mlp": 0.0, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.08268664024117288, + "language_loss": 0.79965454, + "learning_rate": 4.86211231669359e-06, + "loss": 0.81014502, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4974, + "time_per_iteration": 2.4901206493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047752, + "balance_loss_mlp": 1.03915691, + "diversity_loss_mlp": 0.0, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.0658884479140285, + "language_loss": 0.78595436, + "learning_rate": 4.818867211936806e-06, + "loss": 0.7964319, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 4975, + "time_per_iteration": 4.219155550003052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_mlp": 1.03510857, + "diversity_loss_mlp": 0.0, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.07813154083214305, + "language_loss": 0.78541613, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79585493, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4976, + "time_per_iteration": 2.9422388076782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_mlp": 1.03703845, + "diversity_loss_mlp": 0.0, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.07237747383924455, + "language_loss": 0.84659564, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85705405, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4977, + "time_per_iteration": 2.826688528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004691, + "balance_loss_mlp": 1.0002805, + "diversity_loss_mlp": 0.0, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.006664188824760945, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79611945, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4978, + "time_per_iteration": 4.937689781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078841, + "balance_loss_mlp": 1.33186579, + "diversity_loss_mlp": 0.22349364, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.030270093123026424, + "language_loss": 0.87261242, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.8804965, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073015, + "step": 4979, + "time_per_iteration": 2.6448476314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787724, + "balance_loss_mlp": 1.330446, + "diversity_loss_mlp": 0.2238563, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.03851656500602482, + "language_loss": 0.85486841, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86274564, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105729, + "step": 4980, + "time_per_iteration": 2.513583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_mlp": 1.03938699, + "diversity_loss_mlp": 0.0, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.0738676496011813, + "language_loss": 0.80298102, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81346583, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4981, + "time_per_iteration": 3.532383441925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_mlp": 1.03933644, + "diversity_loss_mlp": 0.0, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.05859325637714088, + "language_loss": 0.79110616, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80158764, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4982, + "time_per_iteration": 2.6554603576660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.03964579, + "diversity_loss_mlp": 0.0, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.05822993259734132, + "language_loss": 0.81000149, + "learning_rate": 4.479828637655392e-06, + "loss": 0.82048702, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4983, + "time_per_iteration": 2.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.03656244, + "diversity_loss_mlp": 0.0, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.06921858371067632, + "language_loss": 0.83688623, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84734166, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4984, + "time_per_iteration": 2.4890353679656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047663, + "balance_loss_mlp": 1.03846598, + "diversity_loss_mlp": 0.0, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.0655069361339347, + "language_loss": 0.78102469, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79150128, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4985, + "time_per_iteration": 2.5810418128967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_mlp": 1.03757238, + "diversity_loss_mlp": 0.0, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.0696645623460603, + "language_loss": 0.80404431, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81450725, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4986, + "time_per_iteration": 3.0027694702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044808, + "balance_loss_mlp": 1.03609419, + "diversity_loss_mlp": 0.0, + "epoch": 0.9594074644093882, + "flos": 574490092032.0, + "grad_norm": 0.06168953583598696, + "language_loss": 0.70886958, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71931762, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4987, + "time_per_iteration": 2.7255663871765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046824, + "balance_loss_mlp": 1.03819966, + "diversity_loss_mlp": 0.0, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.0653725751798929, + "language_loss": 0.78369594, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79416412, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 4988, + "time_per_iteration": 2.7598073482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042023, + "balance_loss_mlp": 1.03311229, + "diversity_loss_mlp": 0.0, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.07692135244194774, + "language_loss": 0.78684759, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79726779, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4989, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_mlp": 1.03871953, + "diversity_loss_mlp": 0.0, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.08379738751426644, + "language_loss": 0.85613489, + "learning_rate": 4.193269428723889e-06, + "loss": 0.866611, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4990, + "time_per_iteration": 2.614570379257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046822, + "balance_loss_mlp": 1.03815556, + "diversity_loss_mlp": 0.0, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.08435652614677631, + "language_loss": 0.78316408, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79363227, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.08679199, + "routerloss_mlp": 0.0, + "step": 4991, + "time_per_iteration": 2.748410224914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_mlp": 1.03710628, + "diversity_loss_mlp": 0.0, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.06666949415129908, + "language_loss": 0.79405791, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80451697, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4992, + "time_per_iteration": 2.6129846572875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049094, + "balance_loss_mlp": 1.04027796, + "diversity_loss_mlp": 0.0, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.06505303405528073, + "language_loss": 0.82855588, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83904684, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4993, + "time_per_iteration": 2.697122097015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.03996491, + "diversity_loss_mlp": 0.0, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.05557800406655289, + "language_loss": 0.86002243, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87051046, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4994, + "time_per_iteration": 3.2234411239624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049247, + "balance_loss_mlp": 1.04041374, + "diversity_loss_mlp": 0.0, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.05698113601966363, + "language_loss": 0.75638676, + "learning_rate": 3.994358637073036e-06, + "loss": 0.7668792, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4995, + "time_per_iteration": 2.811509847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047757, + "balance_loss_mlp": 1.03900671, + "diversity_loss_mlp": 0.0, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.06182635414067332, + "language_loss": 0.85539091, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86586845, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4996, + "time_per_iteration": 2.6234097480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00776504, + "balance_loss_mlp": 1.30815172, + "diversity_loss_mlp": 0.22351003, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.03585301103792293, + "language_loss": 0.82592523, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83369029, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067326, + "step": 4997, + "time_per_iteration": 2.7915287017822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077909, + "balance_loss_mlp": 1.31180668, + "diversity_loss_mlp": 0.22519468, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.032099715647482555, + "language_loss": 0.77762806, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78541887, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105895, + "step": 4998, + "time_per_iteration": 2.8831381797790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_mlp": 1.03671455, + "diversity_loss_mlp": 0.0, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.0659062812504805, + "language_loss": 0.75562751, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76608419, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4999, + "time_per_iteration": 2.5965874195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_mlp": 1.0409348, + "diversity_loss_mlp": 0.0, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.06697543006955084, + "language_loss": 0.80806673, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81856602, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 5000, + "time_per_iteration": 2.5651276111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_mlp": 1.0366478, + "diversity_loss_mlp": 0.0, + "epoch": 0.9621008080030781, + "flos": 595635379200.0, + "grad_norm": 0.0765647536824451, + "language_loss": 0.75030309, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76075912, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 5001, + "time_per_iteration": 2.750175952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048226, + "balance_loss_mlp": 1.03932738, + "diversity_loss_mlp": 0.0, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.07727900973651224, + "language_loss": 0.81910348, + "learning_rate": 3.723971737693899e-06, + "loss": 0.82958579, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5002, + "time_per_iteration": 2.665245294570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048413, + "balance_loss_mlp": 1.03946078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.0718035222006464, + "language_loss": 0.80944788, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81993198, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 5003, + "time_per_iteration": 2.7820627689361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047459, + "balance_loss_mlp": 1.03892946, + "diversity_loss_mlp": 0.0, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.09658490174394786, + "language_loss": 0.85061997, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86109459, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.08538818, + "routerloss_mlp": 0.0, + "step": 5004, + "time_per_iteration": 2.5650572776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051022, + "balance_loss_mlp": 1.04228425, + "diversity_loss_mlp": 0.0, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.07079516660765435, + "language_loss": 0.82573175, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83624196, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 5005, + "time_per_iteration": 2.808318853378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054525, + "balance_loss_mlp": 1.04536355, + "diversity_loss_mlp": 0.0, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.06358415598016834, + "language_loss": 0.77436566, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78491098, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 5006, + "time_per_iteration": 2.7581474781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_mlp": 1.0372808, + "diversity_loss_mlp": 0.0, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.06259715736563402, + "language_loss": 0.78214157, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79260308, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5007, + "time_per_iteration": 2.8067400455474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047305, + "balance_loss_mlp": 1.03849518, + "diversity_loss_mlp": 0.0, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.0652004870167461, + "language_loss": 0.8097052, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82017827, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5008, + "time_per_iteration": 2.6624722480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_mlp": 1.03475678, + "diversity_loss_mlp": 0.0, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.07542594197578673, + "language_loss": 0.85320652, + "learning_rate": 3.463025724284974e-06, + "loss": 0.8636443, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 5009, + "time_per_iteration": 2.649427890777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_mlp": 1.03576136, + "diversity_loss_mlp": 0.0, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.06511821335900564, + "language_loss": 0.75133872, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76178598, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 5010, + "time_per_iteration": 2.780074119567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046262, + "balance_loss_mlp": 1.03736854, + "diversity_loss_mlp": 0.0, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.07329288404167861, + "language_loss": 0.84246582, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.8529284, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5011, + "time_per_iteration": 2.651488780975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_mlp": 1.03848636, + "diversity_loss_mlp": 0.0, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.06680869041289342, + "language_loss": 0.88673419, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89720607, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5012, + "time_per_iteration": 2.6489691734313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_mlp": 1.03752685, + "diversity_loss_mlp": 0.0, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.06514803880345414, + "language_loss": 0.83791411, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.848378, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5013, + "time_per_iteration": 2.57792067527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.03800964, + "diversity_loss_mlp": 0.0, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.06277044595718272, + "language_loss": 0.78603232, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79649818, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5014, + "time_per_iteration": 2.75705885887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049, + "balance_loss_mlp": 1.04026842, + "diversity_loss_mlp": 0.0, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.10341285482454692, + "language_loss": 0.84443051, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85492051, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5015, + "time_per_iteration": 2.7894856929779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.03886533, + "diversity_loss_mlp": 0.0, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.07303173278488923, + "language_loss": 0.86459041, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87506503, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5016, + "time_per_iteration": 2.774505376815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_mlp": 1.036268, + "diversity_loss_mlp": 0.0, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.06203977251445547, + "language_loss": 0.81297398, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82342726, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 5017, + "time_per_iteration": 2.7457613945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045833, + "balance_loss_mlp": 1.0369395, + "diversity_loss_mlp": 0.0, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.07389028070446926, + "language_loss": 0.80003834, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81049669, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5018, + "time_per_iteration": 2.5559167861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.03835368, + "diversity_loss_mlp": 0.0, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.05876051048061586, + "language_loss": 0.82367206, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83414584, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 5019, + "time_per_iteration": 2.7295024394989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048475, + "balance_loss_mlp": 1.03974926, + "diversity_loss_mlp": 0.0, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.0742577236438263, + "language_loss": 0.82501537, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83550012, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5020, + "time_per_iteration": 2.732282876968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.0402087, + "diversity_loss_mlp": 0.0, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.07257927833574024, + "language_loss": 0.83663607, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84712267, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.08465576, + "routerloss_mlp": 0.0, + "step": 5021, + "time_per_iteration": 2.8645994663238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003265, + "balance_loss_mlp": 0.99885476, + "diversity_loss_mlp": 0.0, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.004502170891661989, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81697512, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5022, + "time_per_iteration": 4.703518390655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049998, + "balance_loss_mlp": 1.04122436, + "diversity_loss_mlp": 0.0, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.08988904326994861, + "language_loss": 0.81278229, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82328224, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5023, + "time_per_iteration": 2.581846237182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010486, + "balance_loss_mlp": 1.03996289, + "diversity_loss_mlp": 0.0, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.07024301133900458, + "language_loss": 0.85463035, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86511636, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5024, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047106, + "balance_loss_mlp": 1.03803396, + "diversity_loss_mlp": 0.0, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.0827162063613028, + "language_loss": 0.82914466, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.8396157, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 5025, + "time_per_iteration": 2.4615111351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_mlp": 1.03826559, + "diversity_loss_mlp": 0.0, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.061914921870518225, + "language_loss": 0.85848838, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86895955, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5026, + "time_per_iteration": 2.6631765365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_mlp": 1.03661585, + "diversity_loss_mlp": 0.0, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.10389844527854888, + "language_loss": 0.75783134, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76828694, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 5027, + "time_per_iteration": 2.6192245483398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_mlp": 1.03396487, + "diversity_loss_mlp": 0.0, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.06651584976337663, + "language_loss": 0.80529153, + "learning_rate": 2.802372171957057e-06, + "loss": 0.8157168, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 5028, + "time_per_iteration": 2.6251182556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047556, + "balance_loss_mlp": 1.03856707, + "diversity_loss_mlp": 0.0, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.06722764033814799, + "language_loss": 0.79839933, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80887485, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 5029, + "time_per_iteration": 2.7434749603271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_mlp": 1.03893399, + "diversity_loss_mlp": 0.0, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.06316563947076154, + "language_loss": 0.80004889, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81052518, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5030, + "time_per_iteration": 2.9535553455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_mlp": 0.99884009, + "diversity_loss_mlp": 0.0, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.004505813137803552, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76566613, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5031, + "time_per_iteration": 4.665933609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049308, + "balance_loss_mlp": 1.04061723, + "diversity_loss_mlp": 0.0, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.07437893126618236, + "language_loss": 0.79223692, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80272996, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5032, + "time_per_iteration": 2.6745200157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003248, + "balance_loss_mlp": 0.99883741, + "diversity_loss_mlp": 0.0, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.004505868190554417, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79078054, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5033, + "time_per_iteration": 4.830533027648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_mlp": 1.03500438, + "diversity_loss_mlp": 0.0, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07679444902591688, + "language_loss": 0.81878042, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82921857, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5034, + "time_per_iteration": 2.7140636444091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_mlp": 1.03991711, + "diversity_loss_mlp": 0.0, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.06455129167487729, + "language_loss": 0.84188414, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85236967, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 5035, + "time_per_iteration": 2.7100539207458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048563, + "balance_loss_mlp": 1.03969407, + "diversity_loss_mlp": 0.0, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.07457469088112735, + "language_loss": 0.8308925, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84137809, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5036, + "time_per_iteration": 3.0273303985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775046, + "balance_loss_mlp": 1.30442953, + "diversity_loss_mlp": 0.22392677, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.03634578837356394, + "language_loss": 0.79774749, + "learning_rate": 2.513747116326126e-06, + "loss": 0.805498, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01086747, + "step": 5037, + "time_per_iteration": 2.496250629425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_mlp": 1.03794384, + "diversity_loss_mlp": 0.0, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07461894486851982, + "language_loss": 0.77795297, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78841919, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 5038, + "time_per_iteration": 2.735316753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_mlp": 1.03756189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.07661744515255002, + "language_loss": 0.79197067, + "learning_rate": 2.451732453851385e-06, + "loss": 0.8024317, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.08551025, + "routerloss_mlp": 0.0, + "step": 5039, + "time_per_iteration": 2.7147159576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043927, + "balance_loss_mlp": 1.03520727, + "diversity_loss_mlp": 0.0, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.06459150402718168, + "language_loss": 0.82762325, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83806252, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 5040, + "time_per_iteration": 2.5953493118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_mlp": 1.03482664, + "diversity_loss_mlp": 0.0, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.08520160899358113, + "language_loss": 0.87077874, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88121581, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5041, + "time_per_iteration": 2.470695972442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047399, + "balance_loss_mlp": 1.03847671, + "diversity_loss_mlp": 0.0, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.0661289335538221, + "language_loss": 0.85483861, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86531258, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 5042, + "time_per_iteration": 2.7053682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104593, + "balance_loss_mlp": 1.03708434, + "diversity_loss_mlp": 0.0, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.06476327659734085, + "language_loss": 0.81779778, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82825708, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5043, + "time_per_iteration": 2.6728196144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.03794777, + "diversity_loss_mlp": 0.0, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.08267177511200062, + "language_loss": 0.76453322, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77500081, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5044, + "time_per_iteration": 2.5768589973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_mlp": 1.03866804, + "diversity_loss_mlp": 0.0, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.06897516762466789, + "language_loss": 0.80167985, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81215596, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 5045, + "time_per_iteration": 2.795342206954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.03677726, + "diversity_loss_mlp": 0.0, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.0755169004935037, + "language_loss": 0.83042562, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84088135, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5046, + "time_per_iteration": 2.5994091033935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_mlp": 1.03778839, + "diversity_loss_mlp": 0.0, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.07013648257820884, + "language_loss": 0.80700469, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81747067, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5047, + "time_per_iteration": 2.695164680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_mlp": 1.03531933, + "diversity_loss_mlp": 0.0, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.06514840583334829, + "language_loss": 0.80631614, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81675637, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 5048, + "time_per_iteration": 2.692713975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_mlp": 1.04089093, + "diversity_loss_mlp": 0.0, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.06192564808689567, + "language_loss": 0.83786458, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84835804, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.08459473, + "routerloss_mlp": 0.0, + "step": 5049, + "time_per_iteration": 2.934725761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049582, + "balance_loss_mlp": 1.04098153, + "diversity_loss_mlp": 0.0, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.0692175783084948, + "language_loss": 0.81435341, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82484925, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 5050, + "time_per_iteration": 2.732560873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_mlp": 1.03919172, + "diversity_loss_mlp": 0.0, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.07244382675246107, + "language_loss": 0.77611834, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78659672, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5051, + "time_per_iteration": 2.553946018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.03976798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.058871704281843434, + "language_loss": 0.78665662, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79714322, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5052, + "time_per_iteration": 2.700554847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104788, + "balance_loss_mlp": 1.03924966, + "diversity_loss_mlp": 0.0, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.06621518812082018, + "language_loss": 0.79820377, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80868256, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 5053, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048468, + "balance_loss_mlp": 1.03977799, + "diversity_loss_mlp": 0.0, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.07341823776686772, + "language_loss": 0.78280944, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79329413, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5054, + "time_per_iteration": 2.773064136505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.03857064, + "diversity_loss_mlp": 0.0, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.07604483960314544, + "language_loss": 0.79561597, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80608791, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5055, + "time_per_iteration": 2.6578407287597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_mlp": 1.03799832, + "diversity_loss_mlp": 0.0, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.0731380618710485, + "language_loss": 0.80641949, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81688601, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5056, + "time_per_iteration": 2.778132438659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_mlp": 1.04049361, + "diversity_loss_mlp": 0.0, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06341434190577709, + "language_loss": 0.83532751, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84582257, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 5057, + "time_per_iteration": 3.070535898208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104959, + "balance_loss_mlp": 1.04053009, + "diversity_loss_mlp": 0.0, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.06728126412432961, + "language_loss": 0.84373492, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85423088, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 5058, + "time_per_iteration": 2.7407948970794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041956, + "balance_loss_mlp": 1.03302085, + "diversity_loss_mlp": 0.0, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.06896959434834592, + "language_loss": 0.77172613, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78214562, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 5059, + "time_per_iteration": 2.593101978302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.03695965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.06467450172514855, + "language_loss": 0.80509335, + "learning_rate": 1.84724562509897e-06, + "loss": 0.8155489, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 5060, + "time_per_iteration": 3.130805015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_mlp": 1.03940582, + "diversity_loss_mlp": 0.0, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.07143647662877724, + "language_loss": 0.7819376, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79241908, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5061, + "time_per_iteration": 2.7030551433563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105027, + "balance_loss_mlp": 1.04135358, + "diversity_loss_mlp": 0.0, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.07722158587827427, + "language_loss": 0.8399719, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.8504746, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 5062, + "time_per_iteration": 2.7250983715057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588666, + "balance_loss_mlp": 1.0272553, + "diversity_loss_mlp": 0.13149816, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.001262541739400147, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.76580763, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928975, + "step": 5063, + "time_per_iteration": 4.984234094619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0058866, + "balance_loss_mlp": 1.02724576, + "diversity_loss_mlp": 0.13149402, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0012626586872862898, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.8026638, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00929021, + "step": 5064, + "time_per_iteration": 4.959820032119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_mlp": 1.03426552, + "diversity_loss_mlp": 0.0, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.061567595116442546, + "language_loss": 0.76945543, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77988654, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5065, + "time_per_iteration": 2.8605847358703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_mlp": 1.03767872, + "diversity_loss_mlp": 0.0, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + "grad_norm": 0.06408228412896971, + "language_loss": 0.77837121, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78883654, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5066, + "time_per_iteration": 2.8428735733032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051292, + "balance_loss_mlp": 1.04271507, + "diversity_loss_mlp": 0.0, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.06431524577835049, + "language_loss": 0.82438833, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83490127, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 5067, + "time_per_iteration": 2.9748458862304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046087, + "balance_loss_mlp": 1.03706264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.07892101071391965, + "language_loss": 0.76234651, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.7728073, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 5068, + "time_per_iteration": 2.720424175262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_mlp": 1.04073572, + "diversity_loss_mlp": 0.0, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.06592156995071553, + "language_loss": 0.84109974, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.85159504, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5069, + "time_per_iteration": 2.5736031532287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048842, + "balance_loss_mlp": 1.03985965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.08190997494854581, + "language_loss": 0.85377657, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86426497, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 5070, + "time_per_iteration": 2.8101613521575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_mlp": 1.04077792, + "diversity_loss_mlp": 0.0, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.0795575480548678, + "language_loss": 0.82202387, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83251673, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.08514404, + "routerloss_mlp": 0.0, + "step": 5071, + "time_per_iteration": 2.890133857727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047822, + "balance_loss_mlp": 1.03918517, + "diversity_loss_mlp": 0.0, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.08438970561016089, + "language_loss": 0.79632509, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80680329, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 5072, + "time_per_iteration": 2.678088426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_mlp": 1.03586388, + "diversity_loss_mlp": 0.0, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.07402137285679701, + "language_loss": 0.80289578, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81334102, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5073, + "time_per_iteration": 2.5970799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048341, + "balance_loss_mlp": 1.03980589, + "diversity_loss_mlp": 0.0, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07471313714776352, + "language_loss": 0.82160485, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83208829, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 5074, + "time_per_iteration": 2.724550724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047011, + "balance_loss_mlp": 1.03809404, + "diversity_loss_mlp": 0.0, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.07314052432610715, + "language_loss": 0.81910318, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.82957333, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 5075, + "time_per_iteration": 2.6065866947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_mlp": 1.03871298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.06220869349054033, + "language_loss": 0.78624564, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79671854, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 5076, + "time_per_iteration": 2.7079405784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791822, + "balance_loss_mlp": 1.33702493, + "diversity_loss_mlp": 0.22525913, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.034551396825965836, + "language_loss": 0.85462183, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86254001, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067994, + "step": 5077, + "time_per_iteration": 2.641120195388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_mlp": 1.03591776, + "diversity_loss_mlp": 0.0, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.06272749449600955, + "language_loss": 0.84269607, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85314226, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5078, + "time_per_iteration": 2.6361942291259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049536, + "balance_loss_mlp": 1.04075623, + "diversity_loss_mlp": 0.0, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.06781093629055318, + "language_loss": 0.80375177, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81424713, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5079, + "time_per_iteration": 2.799654245376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.03826737, + "diversity_loss_mlp": 0.0, + "epoch": 0.9772989611388996, + "flos": 532090861056.0, + "grad_norm": 0.07054076117423486, + "language_loss": 0.8182112, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82868278, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5080, + "time_per_iteration": 2.6261301040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.03806853, + "diversity_loss_mlp": 0.0, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.08607720599847436, + "language_loss": 0.85967886, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87014663, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 5081, + "time_per_iteration": 3.0126025676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100325, + "balance_loss_mlp": 0.99883974, + "diversity_loss_mlp": 0.0, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.004504556807133143, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79898739, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5082, + "time_per_iteration": 4.989394903182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_mlp": 1.03916299, + "diversity_loss_mlp": 0.0, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.08681180158775233, + "language_loss": 0.84184444, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85232502, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5083, + "time_per_iteration": 2.694652557373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049697, + "balance_loss_mlp": 1.04097056, + "diversity_loss_mlp": 0.0, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.06774609280174271, + "language_loss": 0.81603408, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82653111, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 5084, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_mlp": 1.04026437, + "diversity_loss_mlp": 0.0, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.06648884426689973, + "language_loss": 0.84724671, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85773802, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5085, + "time_per_iteration": 2.7240426540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_mlp": 1.03747988, + "diversity_loss_mlp": 0.0, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.07204518126062733, + "language_loss": 0.82956612, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84002984, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5086, + "time_per_iteration": 2.872143507003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_mlp": 1.03782678, + "diversity_loss_mlp": 0.0, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.06206480321158436, + "language_loss": 0.77373308, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.7841996, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 5087, + "time_per_iteration": 2.6224989891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_mlp": 1.0377537, + "diversity_loss_mlp": 0.0, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.07890344203678189, + "language_loss": 0.80865037, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81911391, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5088, + "time_per_iteration": 3.021143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046697, + "balance_loss_mlp": 1.03788161, + "diversity_loss_mlp": 0.0, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.0707462132351249, + "language_loss": 0.83914399, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.84961092, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5089, + "time_per_iteration": 2.6006627082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_mlp": 1.03978372, + "diversity_loss_mlp": 0.0, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.07258951215182398, + "language_loss": 0.86249393, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87298024, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5090, + "time_per_iteration": 2.57961106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104929, + "balance_loss_mlp": 1.04054606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.0687229233050849, + "language_loss": 0.81661642, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82710934, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5091, + "time_per_iteration": 2.774179458618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_mlp": 1.03394735, + "diversity_loss_mlp": 0.0, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.06774886047931106, + "language_loss": 0.86759937, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87803102, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 5092, + "time_per_iteration": 2.5726425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_mlp": 1.03592968, + "diversity_loss_mlp": 0.0, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06986809662631227, + "language_loss": 0.84486377, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85531104, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5093, + "time_per_iteration": 2.734572649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_mlp": 1.03564167, + "diversity_loss_mlp": 0.0, + "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06627525100654652, + "language_loss": 0.8142283, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82467258, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 5094, + "time_per_iteration": 2.901499032974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104922, + "balance_loss_mlp": 1.04027307, + "diversity_loss_mlp": 0.0, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.05858269256579561, + "language_loss": 0.84523547, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85572767, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 5095, + "time_per_iteration": 2.749011754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.03991067, + "diversity_loss_mlp": 0.0, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.08054047976821545, + "language_loss": 0.8013413, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81182784, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 5096, + "time_per_iteration": 2.6734437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048068, + "balance_loss_mlp": 1.03946686, + "diversity_loss_mlp": 0.0, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.06350240490258963, + "language_loss": 0.7813378, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79181844, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5097, + "time_per_iteration": 2.726039409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047979, + "balance_loss_mlp": 1.03928292, + "diversity_loss_mlp": 0.0, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.06123275422091068, + "language_loss": 0.73776084, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74824059, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5098, + "time_per_iteration": 2.6765458583831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779933, + "balance_loss_mlp": 1.31478071, + "diversity_loss_mlp": 0.22396225, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.03778989641153832, + "language_loss": 0.80182397, + "learning_rate": 9.509698444908344e-07, + "loss": 0.8096233, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105617, + "step": 5099, + "time_per_iteration": 2.6399407386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_mlp": 1.03835607, + "diversity_loss_mlp": 0.0, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.0712325944726878, + "language_loss": 0.79504228, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80551302, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 5100, + "time_per_iteration": 2.605034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_mlp": 1.03872824, + "diversity_loss_mlp": 0.0, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.07915756516451043, + "language_loss": 0.80425239, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81472808, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5101, + "time_per_iteration": 2.653615713119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_mlp": 1.03676605, + "diversity_loss_mlp": 0.0, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.07121268040890673, + "language_loss": 0.84309268, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85354877, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 5102, + "time_per_iteration": 2.7331223487854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048136, + "balance_loss_mlp": 1.03933203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.06082212845964829, + "language_loss": 0.80932826, + "learning_rate": 8.756982280578307e-07, + "loss": 0.81980968, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5103, + "time_per_iteration": 2.731088876724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_mlp": 1.03868246, + "diversity_loss_mlp": 0.0, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.06577153639103081, + "language_loss": 0.82189977, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83237398, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5104, + "time_per_iteration": 2.952533721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_mlp": 1.03831923, + "diversity_loss_mlp": 0.0, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.06798431241240387, + "language_loss": 0.84167528, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85214722, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5105, + "time_per_iteration": 2.86313533782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_mlp": 1.03541374, + "diversity_loss_mlp": 0.0, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.06686184516115971, + "language_loss": 0.81452221, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82496238, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 5106, + "time_per_iteration": 2.708230495452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045424, + "balance_loss_mlp": 1.03651953, + "diversity_loss_mlp": 0.0, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.07713140113072105, + "language_loss": 0.72798324, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73843747, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5107, + "time_per_iteration": 2.6602892875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_mlp": 1.0389818, + "diversity_loss_mlp": 0.0, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.06073968757615098, + "language_loss": 0.82624412, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83672357, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 5108, + "time_per_iteration": 2.637178421020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.03743768, + "diversity_loss_mlp": 0.0, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.05986915063822493, + "language_loss": 0.84416521, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85462821, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5109, + "time_per_iteration": 2.827469825744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046109, + "balance_loss_mlp": 1.03744864, + "diversity_loss_mlp": 0.0, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.05962385879994031, + "language_loss": 0.82830834, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83876944, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5110, + "time_per_iteration": 2.7789480686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_mlp": 1.0431633, + "diversity_loss_mlp": 0.0, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.08038091049338392, + "language_loss": 0.84353125, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85405189, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5111, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046838, + "balance_loss_mlp": 1.03787303, + "diversity_loss_mlp": 0.0, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06156712151194387, + "language_loss": 0.79174638, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80221474, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 5112, + "time_per_iteration": 2.8035426139831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045207, + "balance_loss_mlp": 1.03661203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.062084580460965294, + "language_loss": 0.7939449, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80439693, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5113, + "time_per_iteration": 3.4046199321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051509, + "balance_loss_mlp": 1.04283631, + "diversity_loss_mlp": 0.0, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.08317258429297145, + "language_loss": 0.76198953, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77250463, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 5114, + "time_per_iteration": 2.668950319290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050737, + "balance_loss_mlp": 1.04192185, + "diversity_loss_mlp": 0.0, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.07567501347544295, + "language_loss": 0.79288757, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80339497, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5115, + "time_per_iteration": 2.9638354778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_mlp": 1.03803074, + "diversity_loss_mlp": 0.0, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.07643720957533141, + "language_loss": 0.85790366, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86837137, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5116, + "time_per_iteration": 2.5682291984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_mlp": 1.03958189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.07092299014904967, + "language_loss": 0.84942091, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85990334, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5117, + "time_per_iteration": 2.637977123260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003203, + "balance_loss_mlp": 0.99879217, + "diversity_loss_mlp": 0.0, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.004507137876237267, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78165722, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5118, + "time_per_iteration": 4.920542001724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_mlp": 1.04354155, + "diversity_loss_mlp": 0.0, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.07669150259725825, + "language_loss": 0.82077813, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83130145, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5119, + "time_per_iteration": 2.606952428817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_mlp": 1.03820455, + "diversity_loss_mlp": 0.0, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.061362579804974775, + "language_loss": 0.83024836, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84071916, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 5120, + "time_per_iteration": 2.588484525680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_mlp": 1.04235184, + "diversity_loss_mlp": 0.0, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.0845871079135169, + "language_loss": 0.81128502, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82179761, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5121, + "time_per_iteration": 2.6300275325775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044388, + "balance_loss_mlp": 1.03557277, + "diversity_loss_mlp": 0.0, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.07512722548004026, + "language_loss": 0.80214739, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81259131, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5122, + "time_per_iteration": 2.528691053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_mlp": 1.04198194, + "diversity_loss_mlp": 0.0, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.1156037342387967, + "language_loss": 0.76233137, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77284151, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 5123, + "time_per_iteration": 3.356245517730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_mlp": 1.04011154, + "diversity_loss_mlp": 0.0, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.0641282180922578, + "language_loss": 0.82703745, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83752573, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 5124, + "time_per_iteration": 2.625892400741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051607, + "balance_loss_mlp": 1.04285097, + "diversity_loss_mlp": 0.0, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.06640628679407225, + "language_loss": 0.8357659, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84628195, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5125, + "time_per_iteration": 2.6943421363830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045914, + "balance_loss_mlp": 1.03727758, + "diversity_loss_mlp": 0.0, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.07733437230097125, + "language_loss": 0.78536099, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79582012, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5126, + "time_per_iteration": 2.663972854614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047998, + "balance_loss_mlp": 1.03925443, + "diversity_loss_mlp": 0.0, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.06032739387712679, + "language_loss": 0.82490909, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83538908, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5127, + "time_per_iteration": 2.6729202270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003205, + "balance_loss_mlp": 0.99879479, + "diversity_loss_mlp": 0.0, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.004506363295624896, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80185938, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5128, + "time_per_iteration": 4.911416530609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078477, + "balance_loss_mlp": 1.32232308, + "diversity_loss_mlp": 0.22574797, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.03417894522546616, + "language_loss": 0.79182434, + "learning_rate": 4.620248732582488e-07, + "loss": 0.79967207, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073482, + "step": 5129, + "time_per_iteration": 2.7484471797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077241, + "balance_loss_mlp": 1.299196, + "diversity_loss_mlp": 0.22459432, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.0327459890880189, + "language_loss": 0.86703897, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87476307, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01051447, + "step": 5130, + "time_per_iteration": 3.2471301555633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.03996539, + "diversity_loss_mlp": 0.0, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 0.07462217297713208, + "language_loss": 0.82822615, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83871394, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5131, + "time_per_iteration": 3.0264713764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_mlp": 1.03579676, + "diversity_loss_mlp": 0.0, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.09684750541354396, + "language_loss": 0.78034192, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.7907899, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5132, + "time_per_iteration": 2.501401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.03900599, + "diversity_loss_mlp": 0.0, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.06608816794625222, + "language_loss": 0.86122322, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87170017, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5133, + "time_per_iteration": 2.595851421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046161, + "balance_loss_mlp": 1.03731585, + "diversity_loss_mlp": 0.0, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.07376071696211185, + "language_loss": 0.81970578, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83016741, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5134, + "time_per_iteration": 2.899350881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003204, + "balance_loss_mlp": 0.99879336, + "diversity_loss_mlp": 0.0, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.004506854986446618, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80821157, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5135, + "time_per_iteration": 4.867925405502319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_mlp": 1.04155612, + "diversity_loss_mlp": 0.0, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.06071682459398163, + "language_loss": 0.81917751, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82968003, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5136, + "time_per_iteration": 2.9082465171813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784556, + "balance_loss_mlp": 1.3212409, + "diversity_loss_mlp": 0.22676432, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.028741736801368708, + "language_loss": 0.84462202, + "learning_rate": 3.611116155572969e-07, + "loss": 0.8524676, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01055351, + "step": 5137, + "time_per_iteration": 2.687598705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_mlp": 1.03901052, + "diversity_loss_mlp": 0.0, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.07713102005937741, + "language_loss": 0.80440414, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81488419, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 5138, + "time_per_iteration": 2.7116920948028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_mlp": 1.03775895, + "diversity_loss_mlp": 0.0, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.07051878557324726, + "language_loss": 0.86536169, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87582827, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5139, + "time_per_iteration": 2.477654218673706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045545, + "balance_loss_mlp": 1.03696823, + "diversity_loss_mlp": 0.0, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.05631423705134008, + "language_loss": 0.90553308, + "learning_rate": 3.264696333806771e-07, + "loss": 0.9159885, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5140, + "time_per_iteration": 2.789351224899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049588, + "balance_loss_mlp": 1.04073703, + "diversity_loss_mlp": 0.0, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.06262136237267299, + "language_loss": 0.80186951, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81236541, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5141, + "time_per_iteration": 3.521420478820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104667, + "balance_loss_mlp": 1.03778934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.0653214866342138, + "language_loss": 0.81865728, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.82912397, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5142, + "time_per_iteration": 2.6905152797698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.03794312, + "diversity_loss_mlp": 0.0, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.06437869536727725, + "language_loss": 0.83950132, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.84996706, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5143, + "time_per_iteration": 2.9280619621276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775776, + "balance_loss_mlp": 1.30826199, + "diversity_loss_mlp": 0.22223487, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.03094231827555858, + "language_loss": 0.81775147, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82550919, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01052798, + "step": 5144, + "time_per_iteration": 2.6317298412323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046559, + "balance_loss_mlp": 1.03809488, + "diversity_loss_mlp": 0.0, + "epoch": 0.9898037706810312, + "flos": 567339614208.0, + "grad_norm": 0.06731066884585553, + "language_loss": 0.80676913, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81723469, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.08465576, + "routerloss_mlp": 0.0, + "step": 5145, + "time_per_iteration": 2.6584229469299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588607, + "balance_loss_mlp": 1.02718186, + "diversity_loss_mlp": 0.13146883, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0012619225721446723, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.7873503, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928183, + "step": 5146, + "time_per_iteration": 4.944198369979858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046886, + "balance_loss_mlp": 1.03796947, + "diversity_loss_mlp": 0.0, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06397137457157225, + "language_loss": 0.85200578, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86247468, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 5147, + "time_per_iteration": 2.920581579208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_mlp": 1.03715396, + "diversity_loss_mlp": 0.0, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.06276990657159663, + "language_loss": 0.82674694, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83720535, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5148, + "time_per_iteration": 2.5547163486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047255, + "balance_loss_mlp": 1.0385884, + "diversity_loss_mlp": 0.0, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.06810375608375513, + "language_loss": 0.80711174, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81758434, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5149, + "time_per_iteration": 3.4215774536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045945, + "balance_loss_mlp": 1.03725505, + "diversity_loss_mlp": 0.0, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.08140595339599294, + "language_loss": 0.84472948, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85518897, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5150, + "time_per_iteration": 3.104238271713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_mlp": 1.03784227, + "diversity_loss_mlp": 0.0, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.07994567324384995, + "language_loss": 0.80567187, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81613684, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5151, + "time_per_iteration": 2.597073554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.03738976, + "diversity_loss_mlp": 0.0, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.0788095686937427, + "language_loss": 0.79632246, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80678451, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 5152, + "time_per_iteration": 2.672553062438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104469, + "balance_loss_mlp": 1.03561211, + "diversity_loss_mlp": 0.0, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.06752430275446872, + "language_loss": 0.81667304, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82711995, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 5153, + "time_per_iteration": 2.6830427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047286, + "balance_loss_mlp": 1.03867936, + "diversity_loss_mlp": 0.0, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.06636262173491685, + "language_loss": 0.86006486, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.8705377, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 5154, + "time_per_iteration": 2.6368730068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104807, + "balance_loss_mlp": 1.03920066, + "diversity_loss_mlp": 0.0, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.060555053830850476, + "language_loss": 0.82984126, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.84032202, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 5155, + "time_per_iteration": 2.989109754562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043535, + "balance_loss_mlp": 1.03463578, + "diversity_loss_mlp": 0.0, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.06288570543658592, + "language_loss": 0.80066729, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81110263, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5156, + "time_per_iteration": 2.6498100757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_mlp": 1.03691423, + "diversity_loss_mlp": 0.0, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.06594459780967553, + "language_loss": 0.84395134, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85440862, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 5157, + "time_per_iteration": 2.6490535736083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_mlp": 1.03646517, + "diversity_loss_mlp": 0.0, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.06545947039571581, + "language_loss": 0.77654356, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78699744, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 5158, + "time_per_iteration": 2.7639706134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047617, + "balance_loss_mlp": 1.03868222, + "diversity_loss_mlp": 0.0, + "epoch": 0.9924971142747211, + "flos": 466557096960.0, + "grad_norm": 0.06168897901648668, + "language_loss": 0.8080498, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81852591, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 5159, + "time_per_iteration": 2.7385036945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049861, + "balance_loss_mlp": 1.04073584, + "diversity_loss_mlp": 0.0, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.06899221386615825, + "language_loss": 0.82835615, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83885473, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 5160, + "time_per_iteration": 2.559859037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_mlp": 1.04187524, + "diversity_loss_mlp": 0.0, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.08668343737324606, + "language_loss": 0.81916565, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82967234, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5161, + "time_per_iteration": 2.5678670406341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048471, + "balance_loss_mlp": 1.03970289, + "diversity_loss_mlp": 0.0, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.06843444651252836, + "language_loss": 0.84165454, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85213923, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5162, + "time_per_iteration": 2.7581584453582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.03851247, + "diversity_loss_mlp": 0.0, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06263196001846472, + "language_loss": 0.85711837, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86758995, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5163, + "time_per_iteration": 2.7846977710723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_mlp": 1.0378511, + "diversity_loss_mlp": 0.0, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.06164233206359557, + "language_loss": 0.83855546, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84902167, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5164, + "time_per_iteration": 2.716646671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_mlp": 1.03856587, + "diversity_loss_mlp": 0.0, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.06133860998625567, + "language_loss": 0.86944854, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.8799212, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5165, + "time_per_iteration": 2.614095687866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_mlp": 1.03866947, + "diversity_loss_mlp": 0.0, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.06754893239543082, + "language_loss": 0.80281818, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81329167, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5166, + "time_per_iteration": 3.028465986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050745, + "balance_loss_mlp": 1.04209042, + "diversity_loss_mlp": 0.0, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.06956871932384841, + "language_loss": 0.82008004, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83058745, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5167, + "time_per_iteration": 2.698882818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_mlp": 1.03672278, + "diversity_loss_mlp": 0.0, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.06410012888366921, + "language_loss": 0.80157578, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81203187, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5168, + "time_per_iteration": 2.7812376022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.0377903, + "diversity_loss_mlp": 0.0, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.06620347908149736, + "language_loss": 0.82235384, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83281839, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5169, + "time_per_iteration": 2.7237818241119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_mlp": 1.0389545, + "diversity_loss_mlp": 0.0, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.06912708749251066, + "language_loss": 0.82253057, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83300692, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5170, + "time_per_iteration": 2.9818952083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047087, + "balance_loss_mlp": 1.03824186, + "diversity_loss_mlp": 0.0, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.08243647739411311, + "language_loss": 0.82281691, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83328784, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5171, + "time_per_iteration": 2.7422502040863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104891, + "balance_loss_mlp": 1.04017246, + "diversity_loss_mlp": 0.0, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.06824796371814347, + "language_loss": 0.86093032, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87141943, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5172, + "time_per_iteration": 2.51432728767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046756, + "balance_loss_mlp": 1.03810704, + "diversity_loss_mlp": 0.0, + "epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.06509423598404523, + "language_loss": 0.85527599, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86574364, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.08654785, + "routerloss_mlp": 0.0, + "step": 5173, + "time_per_iteration": 2.6191818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_mlp": 1.0386107, + "diversity_loss_mlp": 0.0, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.06870476422803651, + "language_loss": 0.81628877, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82676393, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5174, + "time_per_iteration": 2.569406270980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046917, + "balance_loss_mlp": 1.03805971, + "diversity_loss_mlp": 0.0, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.06879136838428648, + "language_loss": 0.81909287, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82956201, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5175, + "time_per_iteration": 2.5882654190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049317, + "balance_loss_mlp": 1.04018593, + "diversity_loss_mlp": 0.0, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.08802784581931292, + "language_loss": 0.76484299, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77533621, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 5176, + "time_per_iteration": 2.7355172634124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781269, + "balance_loss_mlp": 1.31630397, + "diversity_loss_mlp": 0.2250234, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.03484461119289524, + "language_loss": 0.80370349, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81151617, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01060532, + "step": 5177, + "time_per_iteration": 2.488933563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_mlp": 1.03673339, + "diversity_loss_mlp": 0.0, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.060646192988618466, + "language_loss": 0.80473614, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81519341, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5178, + "time_per_iteration": 2.860849380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_mlp": 1.03690243, + "diversity_loss_mlp": 0.0, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.06956117500096984, + "language_loss": 0.73755258, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74801028, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5179, + "time_per_iteration": 2.652787446975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_mlp": 1.04275656, + "diversity_loss_mlp": 0.0, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.081637230316847, + "language_loss": 0.89049286, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90100861, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 5180, + "time_per_iteration": 2.7644760608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048065, + "balance_loss_mlp": 1.03896928, + "diversity_loss_mlp": 0.0, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.0759879935902396, + "language_loss": 0.81941384, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82989448, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 5181, + "time_per_iteration": 2.9104771614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046151, + "balance_loss_mlp": 1.03727567, + "diversity_loss_mlp": 0.0, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.0884261396290618, + "language_loss": 0.76887906, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77934057, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 5182, + "time_per_iteration": 2.721599578857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_mlp": 1.03908062, + "diversity_loss_mlp": 0.0, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.0648988132762576, + "language_loss": 0.81727201, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82775426, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 5183, + "time_per_iteration": 2.7815635204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048087, + "balance_loss_mlp": 1.03952742, + "diversity_loss_mlp": 0.0, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.05502432672300637, + "language_loss": 0.81058741, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82106829, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.08569336, + "routerloss_mlp": 0.0, + "step": 5184, + "time_per_iteration": 3.372502326965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105351, + "balance_loss_mlp": 1.04470634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.08025246784684749, + "language_loss": 0.83187962, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84241462, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5185, + "time_per_iteration": 2.835519313812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_mlp": 1.03881598, + "diversity_loss_mlp": 0.0, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.06904687845719167, + "language_loss": 0.77359349, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78406811, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5186, + "time_per_iteration": 2.8937785625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_mlp": 1.03904831, + "diversity_loss_mlp": 0.0, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07280590001962838, + "language_loss": 0.79471743, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80519885, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 5187, + "time_per_iteration": 2.635932207107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044774, + "balance_loss_mlp": 1.03606606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.05359795749809877, + "language_loss": 0.84325933, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85370713, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 5188, + "time_per_iteration": 2.7615973949432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_mlp": 1.03626382, + "diversity_loss_mlp": 0.0, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.0657633073490906, + "language_loss": 0.8937813, + "learning_rate": 7.861726879943021e-09, + "loss": 0.9042353, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 5189, + "time_per_iteration": 2.543257236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_mlp": 1.03698051, + "diversity_loss_mlp": 0.0, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.0777283177143095, + "language_loss": 0.78666133, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79711688, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 5190, + "time_per_iteration": 2.6314117908477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010468, + "balance_loss_mlp": 1.03816903, + "diversity_loss_mlp": 0.0, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.05898093011437241, + "language_loss": 0.84184742, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85231537, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5191, + "time_per_iteration": 2.6695079803466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104994, + "balance_loss_mlp": 1.04094553, + "diversity_loss_mlp": 0.0, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.06405577435904004, + "language_loss": 0.86847579, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87897515, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5192, + "time_per_iteration": 2.8024892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_mlp": 1.03778648, + "diversity_loss_mlp": 0.0, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.0686453524231272, + "language_loss": 0.88108921, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89155686, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 5193, + "time_per_iteration": 2.4370131492614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_mlp": 1.0364393, + "diversity_loss_mlp": 0.0, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.06828670759326802, + "language_loss": 0.85050082, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86095428, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5194, + "time_per_iteration": 2.765718698501587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_mlp": 1.04024625, + "diversity_loss_mlp": 0.0, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.07220046609149769, + "language_loss": 0.75592577, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76641411, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5195, + "time_per_iteration": 2.713330030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790766, + "balance_loss_mlp": 1.33585405, + "diversity_loss_mlp": 0.22418211, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.03504416823087641, + "language_loss": 0.81017089, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81807852, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01074793, + "step": 5196, + "time_per_iteration": 3.730872631072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_mlp": 1.028754, + "diversity_loss_mlp": 0.0, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.09543829836144671, + "language_loss": 0.69830346, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70866984, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.0788269, + "routerloss_mlp": 0.0, + "step": 5197, + "time_per_iteration": 4.026475429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018596, + "balance_loss_mlp": 1.01257348, + "diversity_loss_mlp": 0.0, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.032396730253084045, + "language_loss": 0.84149116, + "learning_rate": 0.0, + "loss": 0.85167712, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.06033325, + "routerloss_mlp": 0.0, + "step": 5198, + "time_per_iteration": 5.587369918823242 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/training_args.bin b/sft_pretrain/Full_competesmoev30/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992 diff --git a/sft_pretrain/Full_competesmoev30/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_competesmoev30/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_competesmoev30/config.json b/sft_pretrain/Full_competesmoev30/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c85d1759754ccff61df63edfccf471768773f5e5 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.005, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.005, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": true, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 9, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "competesmoev30", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.07, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_competesmoev30/generation_config.json b/sft_pretrain/Full_competesmoev30/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_competesmoev30/model-00001-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_competesmoev30/model-00002-of-00002.safetensors b/sft_pretrain/Full_competesmoev30/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..221517641f8c3e836c30a881dbeae36e687c8737 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b87b0e369f9a71b0854220a5351ec7cad9e6d1184d114409009a80f2629f49 +size 3759030203 diff --git a/sft_pretrain/Full_competesmoev30/model.safetensors.index.json b/sft_pretrain/Full_competesmoev30/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..1c36aea017a82c896c2bf8d32802184967811e4c --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/model.safetensors.index.json @@ -0,0 +1,673 @@ +{ + "metadata": { + "total_size": 8731429675 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.prob_flips": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_competesmoev30/special_tokens_map.json b/sft_pretrain/Full_competesmoev30/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_competesmoev30/tokenizer.model b/sft_pretrain/Full_competesmoev30/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_competesmoev30/tokenizer_config.json b/sft_pretrain/Full_competesmoev30/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_competesmoev30/trainer_state.json b/sft_pretrain/Full_competesmoev30/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e8828a49ad1f8ce920fdc810fbf50b49d32f564 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/trainer_state.json @@ -0,0 +1,87809 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.847607787273237, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 24.278199672698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02015882, + "balance_loss_mlp": 1.26743817, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.39987998366427, + "language_loss": 2.42349291, + "learning_rate": 0.00013726078121135892, + "loss": 2.44365168, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.4765625, + "step": 2, + "time_per_iteration": 2.74550199508667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02034476, + "balance_loss_mlp": 1.28603244, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.46624299076427, + "language_loss": 2.13354897, + "learning_rate": 0.00021755319103969496, + "loss": 2.15389395, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.4765625, + "step": 3, + "time_per_iteration": 2.820986270904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02058399, + "balance_loss_mlp": 1.29927421, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 3.493910581799846, + "language_loss": 1.37129521, + "learning_rate": 0.00027452156242271784, + "loss": 1.3918792, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.5859375, + "step": 4, + "time_per_iteration": 2.677243947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02066247, + "balance_loss_mlp": 1.30979228, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.8674817587168525, + "language_loss": 1.33187473, + "learning_rate": 0.0003187096642208417, + "loss": 1.35253716, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.6032657623291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02071583, + "balance_loss_mlp": 1.31322157, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.033424387355904, + "language_loss": 1.30649018, + "learning_rate": 0.0003548139722510539, + "loss": 1.32720602, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.6967170238494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101369, + "balance_loss_mlp": 1.33652186, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.7061194413900653, + "language_loss": 1.22160292, + "learning_rate": 0.00038533972973918044, + "loss": 1.24261677, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.64453125, + "step": 7, + "time_per_iteration": 2.7199785709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02146806, + "balance_loss_mlp": 1.36975181, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.35850971046258795, + "language_loss": 1.17196155, + "learning_rate": 0.0004117823436340768, + "loss": 1.19342971, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6428823471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02153063, + "balance_loss_mlp": 1.36837983, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.22105321402960548, + "language_loss": 1.2430563, + "learning_rate": 0.00043510638207938993, + "loss": 1.26458693, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.8359375, + "step": 9, + "time_per_iteration": 2.7773404121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02194678, + "balance_loss_mlp": 1.4077065, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.2650641779955913, + "language_loss": 1.13927829, + "learning_rate": 0.00045597044543220066, + "loss": 1.16122508, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.87109375, + "step": 10, + "time_per_iteration": 2.6966803073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02215625, + "balance_loss_mlp": 1.42216802, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.17099192662038445, + "language_loss": 1.11761594, + "learning_rate": 0.00047484428652143135, + "loss": 1.13977218, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.921875, + "step": 11, + "time_per_iteration": 2.846426010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218955, + "balance_loss_mlp": 1.42854977, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11899482154082718, + "language_loss": 1.17641664, + "learning_rate": 0.0004920747534624128, + "loss": 1.19860613, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.890625, + "step": 12, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02207543, + "balance_loss_mlp": 1.41751897, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.14172497717456267, + "language_loss": 1.20158505, + "learning_rate": 0.0005079252465375872, + "loss": 1.22366059, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.8984375, + "step": 13, + "time_per_iteration": 2.7560088634490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02203989, + "balance_loss_mlp": 1.41625452, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.1448362910448976, + "language_loss": 1.09927368, + "learning_rate": 0.0005226005109505393, + "loss": 1.12131357, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.859375, + "step": 14, + "time_per_iteration": 2.623379707336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02125464, + "balance_loss_mlp": 1.36481309, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.13392565488521943, + "language_loss": 1.15514731, + "learning_rate": 0.0005362628552605367, + "loss": 1.17640197, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.59765625, + "step": 15, + "time_per_iteration": 2.596914768218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02122013, + "balance_loss_mlp": 1.3682282, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12347082932885804, + "language_loss": 1.19854355, + "learning_rate": 0.0005490431248454357, + "loss": 1.21976352, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.53125, + "step": 16, + "time_per_iteration": 2.685072898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02419001, + "balance_loss_mlp": 1.67742407, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2736231848322761, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78124118, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.40625, + "step": 17, + "time_per_iteration": 5.928683757781982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02002798, + "balance_loss_mlp": 1.29097593, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.09154168539226555, + "language_loss": 1.06151795, + "learning_rate": 0.0005723671632907488, + "loss": 1.08154595, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.12109375, + "step": 18, + "time_per_iteration": 2.6618175506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945774, + "balance_loss_mlp": 1.26141703, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11342789334024792, + "language_loss": 1.1168499, + "learning_rate": 0.0005830738490244919, + "loss": 1.13630772, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.8515625, + "step": 19, + "time_per_iteration": 2.5248160362243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01908107, + "balance_loss_mlp": 1.24625731, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.10096694408553891, + "language_loss": 1.13845825, + "learning_rate": 0.0005932312266435596, + "loss": 1.15753937, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.62109375, + "step": 20, + "time_per_iteration": 2.800579309463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01843731, + "balance_loss_mlp": 1.21316147, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1378013237236713, + "language_loss": 1.09039617, + "learning_rate": 0.0006028929207788754, + "loss": 1.10883355, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.30078125, + "step": 21, + "time_per_iteration": 2.693075656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01796963, + "balance_loss_mlp": 1.19309616, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.10529209836160877, + "language_loss": 1.11936951, + "learning_rate": 0.0006121050677327902, + "loss": 1.13733912, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.03125, + "step": 22, + "time_per_iteration": 2.8881568908691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746784, + "balance_loss_mlp": 1.17724967, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.085047282331545, + "language_loss": 1.02962387, + "learning_rate": 0.0006209076479463684, + "loss": 1.04709172, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.70703125, + "step": 23, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01714578, + "balance_loss_mlp": 1.16831291, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.1446104563316411, + "language_loss": 1.12823486, + "learning_rate": 0.0006293355346737718, + "loss": 1.1453805, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.46875, + "step": 24, + "time_per_iteration": 2.662325382232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664908, + "balance_loss_mlp": 1.14725351, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08929005506461926, + "language_loss": 1.08926165, + "learning_rate": 0.0006374193284416834, + "loss": 1.10591078, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.17578125, + "step": 25, + "time_per_iteration": 2.7794790267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01647718, + "balance_loss_mlp": 1.15752983, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.382953647696995, + "language_loss": 1.07588863, + "learning_rate": 0.0006451860277489461, + "loss": 1.09236586, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 4.89453125, + "step": 26, + "time_per_iteration": 2.6574552059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01623745, + "balance_loss_mlp": 1.1686517, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.13377036730821817, + "language_loss": 1.14740276, + "learning_rate": 0.0006526595731190848, + "loss": 1.16364002, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.55078125, + "step": 27, + "time_per_iteration": 2.5226099491119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558493, + "balance_loss_mlp": 1.14078379, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.07887885702942038, + "language_loss": 1.08901012, + "learning_rate": 0.0006598612921618983, + "loss": 1.10459495, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.18359375, + "step": 28, + "time_per_iteration": 2.839459180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01503024, + "balance_loss_mlp": 1.11487842, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08107526710192482, + "language_loss": 1.0255661, + "learning_rate": 0.0006668102665011454, + "loss": 1.04059625, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 3.87695312, + "step": 29, + "time_per_iteration": 3.257913589477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01474291, + "balance_loss_mlp": 1.11227608, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.13697687064909753, + "language_loss": 1.11483085, + "learning_rate": 0.0006735236364718957, + "loss": 1.1295737, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.6171875, + "step": 30, + "time_per_iteration": 2.7084178924560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142553, + "balance_loss_mlp": 1.09460521, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.11726589989245696, + "language_loss": 1.10265064, + "learning_rate": 0.0006800168558381346, + "loss": 1.11690593, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.31054688, + "step": 31, + "time_per_iteration": 2.588890552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390474, + "balance_loss_mlp": 1.08758759, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.10666498872881085, + "language_loss": 1.13109517, + "learning_rate": 0.0006863039060567947, + "loss": 1.14499998, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.0234375, + "step": 32, + "time_per_iteration": 2.671940326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372012, + "balance_loss_mlp": 1.09372997, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.09439068448398888, + "language_loss": 1.06106949, + "learning_rate": 0.0006923974775611263, + "loss": 1.07478976, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.78710938, + "step": 33, + "time_per_iteration": 2.854475498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01370442, + "balance_loss_mlp": 1.11390388, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.06215931521992215, + "language_loss": 1.03014469, + "learning_rate": 0.0006983091239737814, + "loss": 1.04384923, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.56445312, + "step": 34, + "time_per_iteration": 3.0690298080444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361344, + "balance_loss_mlp": 1.12464166, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.09515467516314563, + "language_loss": 1.01683736, + "learning_rate": 0.0007040493939600222, + "loss": 1.03045082, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.36523438, + "step": 35, + "time_per_iteration": 2.8111989498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01344012, + "balance_loss_mlp": 1.12600231, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.06987238068095514, + "language_loss": 1.02534437, + "learning_rate": 0.0007096279445021078, + "loss": 1.0387845, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.18554688, + "step": 36, + "time_per_iteration": 2.704871654510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01340389, + "balance_loss_mlp": 1.14107156, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.1404335763188921, + "language_loss": 1.09097314, + "learning_rate": 0.0007150536386503726, + "loss": 1.10437703, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 1.9921875, + "step": 37, + "time_per_iteration": 2.872793436050415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315876, + "balance_loss_mlp": 1.13486814, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.16061978088166937, + "language_loss": 1.01896858, + "learning_rate": 0.0007203346302358509, + "loss": 1.0321275, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.81054688, + "step": 38, + "time_per_iteration": 2.9352476596832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304512, + "balance_loss_mlp": 1.13332772, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.19798610454398824, + "language_loss": 1.06942129, + "learning_rate": 0.000725478437577282, + "loss": 1.08246636, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.71386719, + "step": 39, + "time_per_iteration": 2.766380786895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266397, + "balance_loss_mlp": 1.10894561, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.0682924496804484, + "language_loss": 1.01676083, + "learning_rate": 0.0007304920078549186, + "loss": 1.02942467, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.57324219, + "step": 40, + "time_per_iteration": 2.7017316818237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260084, + "balance_loss_mlp": 1.10988009, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.18661861035366387, + "language_loss": 1.03648829, + "learning_rate": 0.0007353817735343603, + "loss": 1.04908907, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.50097656, + "step": 41, + "time_per_iteration": 2.7103593349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243555, + "balance_loss_mlp": 1.10651195, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.09436856387031409, + "language_loss": 0.996611, + "learning_rate": 0.0007401537019902344, + "loss": 1.00904644, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.37109375, + "step": 42, + "time_per_iteration": 2.6113343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223311, + "balance_loss_mlp": 1.09961998, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.12261468754490484, + "language_loss": 1.02989793, + "learning_rate": 0.0007448133392900729, + "loss": 1.04213095, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.23535156, + "step": 43, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123183, + "balance_loss_mlp": 1.11490965, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.06742287935331995, + "language_loss": 0.98469728, + "learning_rate": 0.0007493658489441491, + "loss": 0.9970156, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.16699219, + "step": 44, + "time_per_iteration": 2.8660154342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221739, + "balance_loss_mlp": 1.11549973, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.13165016268944502, + "language_loss": 1.02125764, + "learning_rate": 0.0007538160463002316, + "loss": 1.03347504, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.06445312, + "step": 45, + "time_per_iteration": 2.647026777267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219104, + "balance_loss_mlp": 1.12082767, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.09154051415002856, + "language_loss": 1.05303812, + "learning_rate": 0.0007581684291577274, + "loss": 1.06522906, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.98193359, + "step": 46, + "time_per_iteration": 2.5779762268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211973, + "balance_loss_mlp": 1.12180293, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.10098348979088022, + "language_loss": 1.08761919, + "learning_rate": 0.0007624272050891776, + "loss": 1.09973884, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.90185547, + "step": 47, + "time_per_iteration": 2.8511393070220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.09893048, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06288361982709323, + "language_loss": 0.98731792, + "learning_rate": 0.0007665963158851307, + "loss": 0.9991011, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.79345703, + "step": 48, + "time_per_iteration": 2.7975704669952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117803, + "balance_loss_mlp": 1.10588408, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07935638516568921, + "language_loss": 1.07018328, + "learning_rate": 0.0007706794594783609, + "loss": 1.08196378, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.72167969, + "step": 49, + "time_per_iteration": 2.762869358062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170672, + "balance_loss_mlp": 1.10281849, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.06589219417940043, + "language_loss": 1.06122911, + "learning_rate": 0.0007746801096530423, + "loss": 1.07293582, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.67919922, + "step": 50, + "time_per_iteration": 2.755232334136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116692, + "balance_loss_mlp": 1.10545588, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.09337036144210262, + "language_loss": 1.10751569, + "learning_rate": 0.0007786015338021173, + "loss": 1.11918497, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.61376953, + "step": 51, + "time_per_iteration": 2.6145899295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159208, + "balance_loss_mlp": 1.10279799, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0700474736529942, + "language_loss": 1.03127432, + "learning_rate": 0.0007824468089603051, + "loss": 1.04286635, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.56396484, + "step": 52, + "time_per_iteration": 2.653333902359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.1128397, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.0678828268350522, + "language_loss": 1.02721131, + "learning_rate": 0.0007862188363098669, + "loss": 1.0388329, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.4934082, + "step": 53, + "time_per_iteration": 3.16854190826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150565, + "balance_loss_mlp": 1.10464573, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.07226768628462193, + "language_loss": 1.03151178, + "learning_rate": 0.0007899203543304438, + "loss": 1.04301751, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.45947266, + "step": 54, + "time_per_iteration": 2.684342384338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153237, + "balance_loss_mlp": 1.10901022, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.2877805661885644, + "language_loss": 1.16480064, + "learning_rate": 0.0007935539507422731, + "loss": 1.17633295, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.44213867, + "step": 55, + "time_per_iteration": 2.550560235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135293, + "balance_loss_mlp": 1.09545326, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.09011321470942846, + "language_loss": 1.08752644, + "learning_rate": 0.0007971220733732573, + "loss": 1.09887934, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.39819336, + "step": 56, + "time_per_iteration": 2.6777026653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138051, + "balance_loss_mlp": 1.10307515, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08011479339587849, + "language_loss": 1.04026377, + "learning_rate": 0.0008006270400641869, + "loss": 1.05164433, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.34985352, + "step": 57, + "time_per_iteration": 2.6899423599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140153, + "balance_loss_mlp": 1.10787153, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.11169369867739573, + "language_loss": 1.05261517, + "learning_rate": 0.0008040710477125043, + "loss": 1.06401682, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.32275391, + "step": 58, + "time_per_iteration": 2.723038911819458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144338, + "balance_loss_mlp": 1.11403465, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.15034464280850074, + "language_loss": 1.06417704, + "learning_rate": 0.0008074561805429771, + "loss": 1.07562041, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.30297852, + "step": 59, + "time_per_iteration": 2.6378283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_mlp": 1.10842514, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.12260992246729245, + "language_loss": 1.03937411, + "learning_rate": 0.0008107844176832545, + "loss": 1.05073476, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.27612305, + "step": 60, + "time_per_iteration": 2.700141668319702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143848, + "balance_loss_mlp": 1.11745548, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.07189127634205647, + "language_loss": 1.05365705, + "learning_rate": 0.0008140576401132568, + "loss": 1.06509542, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.2644043, + "step": 61, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141309, + "balance_loss_mlp": 1.11781311, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.05216073972873087, + "language_loss": 1.06422329, + "learning_rate": 0.0008172776370494935, + "loss": 1.07563639, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.23461914, + "step": 62, + "time_per_iteration": 2.725492238998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136117, + "balance_loss_mlp": 1.11272764, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.101779425959611, + "language_loss": 1.13612652, + "learning_rate": 0.0008204461118185703, + "loss": 1.14748764, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.23376465, + "step": 63, + "time_per_iteration": 2.5753746032714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148279, + "balance_loss_mlp": 1.12627339, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.07447427381713748, + "language_loss": 1.0324012, + "learning_rate": 0.0008235646872681536, + "loss": 1.04388404, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22009277, + "step": 64, + "time_per_iteration": 2.5766890048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134709, + "balance_loss_mlp": 1.11331069, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.38827595406324295, + "language_loss": 1.02755439, + "learning_rate": 0.0008266349107584288, + "loss": 1.03890157, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2142334, + "step": 65, + "time_per_iteration": 2.6795432567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150765, + "balance_loss_mlp": 1.12982011, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.12495940986475743, + "language_loss": 1.06208372, + "learning_rate": 0.0008296582587724851, + "loss": 1.07359147, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.20947266, + "step": 66, + "time_per_iteration": 2.7176458835601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140545, + "balance_loss_mlp": 1.11969519, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.1040817091496257, + "language_loss": 1.04495656, + "learning_rate": 0.0008326361411800136, + "loss": 1.05636215, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.20861816, + "step": 67, + "time_per_iteration": 2.944484233856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11664486, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.1236975736999165, + "language_loss": 1.04613113, + "learning_rate": 0.0008355699051851403, + "loss": 1.05749726, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.1998291, + "step": 68, + "time_per_iteration": 2.7155401706695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163949, + "balance_loss_mlp": 1.14371967, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.08669769947970225, + "language_loss": 1.11325383, + "learning_rate": 0.0008384608389860635, + "loss": 1.12489343, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.20214844, + "step": 69, + "time_per_iteration": 2.6746206283569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170727, + "balance_loss_mlp": 1.15127182, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.13494585106435908, + "language_loss": 1.01927853, + "learning_rate": 0.000841310175171381, + "loss": 1.03098571, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.19433594, + "step": 70, + "time_per_iteration": 2.6096978187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116458, + "balance_loss_mlp": 1.14537501, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.08071853308807045, + "language_loss": 0.99831259, + "learning_rate": 0.000844119093875517, + "loss": 1.00995839, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.19189453, + "step": 71, + "time_per_iteration": 2.7110228538513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172694, + "balance_loss_mlp": 1.1531322, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.1298896621631551, + "language_loss": 1.05077183, + "learning_rate": 0.0008468887257134666, + "loss": 1.06249881, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.19543457, + "step": 72, + "time_per_iteration": 2.6877832412719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117331, + "balance_loss_mlp": 1.15338969, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.15655470084299106, + "language_loss": 1.07319438, + "learning_rate": 0.0008496201545131264, + "loss": 1.08492744, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.19909668, + "step": 73, + "time_per_iteration": 2.712404251098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155518, + "balance_loss_mlp": 1.13590837, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.16190508579873739, + "language_loss": 1.04767108, + "learning_rate": 0.0008523144198617317, + "loss": 1.05922627, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.19604492, + "step": 74, + "time_per_iteration": 3.1923534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136399, + "balance_loss_mlp": 1.11624122, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.09478832041488004, + "language_loss": 1.04861999, + "learning_rate": 0.0008549725194813783, + "loss": 1.05998397, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20153809, + "step": 75, + "time_per_iteration": 2.6708076000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116508, + "balance_loss_mlp": 1.09800684, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.08770819878028477, + "language_loss": 1.03907192, + "learning_rate": 0.0008575954114472099, + "loss": 1.05023694, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.18481445, + "step": 76, + "time_per_iteration": 3.13152813911438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115762, + "balance_loss_mlp": 1.09717751, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.13848190952411177, + "language_loss": 1.01474786, + "learning_rate": 0.0008601840162606118, + "loss": 1.02590549, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.18591309, + "step": 77, + "time_per_iteration": 3.0026464462280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126663, + "balance_loss_mlp": 1.10745883, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.04300320251384177, + "language_loss": 1.07548404, + "learning_rate": 0.000862739218788641, + "loss": 1.08675063, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.19189453, + "step": 78, + "time_per_iteration": 2.780151128768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136666, + "balance_loss_mlp": 1.11736631, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.05300805683051922, + "language_loss": 1.05217659, + "learning_rate": 0.0008652618700799138, + "loss": 1.0635432, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.19287109, + "step": 79, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115046, + "balance_loss_mlp": 1.13105261, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.13679514692214284, + "language_loss": 1.04483461, + "learning_rate": 0.0008677527890662774, + "loss": 1.05633926, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.19384766, + "step": 80, + "time_per_iteration": 2.4652533531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151484, + "balance_loss_mlp": 1.13120639, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.06949005945359786, + "language_loss": 1.05593443, + "learning_rate": 0.0008702127641587799, + "loss": 1.06744933, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20263672, + "step": 81, + "time_per_iteration": 2.6423192024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155894, + "balance_loss_mlp": 1.13492513, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.09507058081046676, + "language_loss": 1.01514888, + "learning_rate": 0.0008726425547457192, + "loss": 1.02670789, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20959473, + "step": 82, + "time_per_iteration": 2.7670798301696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.11376882, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.0793725108169458, + "language_loss": 1.00304663, + "learning_rate": 0.0008750428925998964, + "loss": 1.01438546, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.20117188, + "step": 83, + "time_per_iteration": 2.7451062202453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_mlp": 1.12516141, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.14534943996774727, + "language_loss": 1.06251049, + "learning_rate": 0.0008774144832015932, + "loss": 1.07396317, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.2010498, + "step": 84, + "time_per_iteration": 2.7039954662323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01784137, + "balance_loss_mlp": 1.77116704, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.33978769388161495, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76558447, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.12988281, + "step": 85, + "time_per_iteration": 4.672428846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133263, + "balance_loss_mlp": 1.11339045, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.0814354491433929, + "language_loss": 1.01647198, + "learning_rate": 0.0008820741205014318, + "loss": 1.02780461, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.19873047, + "step": 86, + "time_per_iteration": 2.9217472076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135249, + "balance_loss_mlp": 1.11522174, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.09136661427056217, + "language_loss": 1.02933669, + "learning_rate": 0.0008843634575408404, + "loss": 1.04068923, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20031738, + "step": 87, + "time_per_iteration": 2.7795376777648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126576, + "balance_loss_mlp": 1.10805094, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.08653972064742017, + "language_loss": 1.04609084, + "learning_rate": 0.0008866266301555082, + "loss": 1.0573566, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18518066, + "step": 88, + "time_per_iteration": 2.7490010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144591, + "balance_loss_mlp": 1.12630451, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.0643644920813647, + "language_loss": 1.05052233, + "learning_rate": 0.0008888642296509615, + "loss": 1.06196821, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18273926, + "step": 89, + "time_per_iteration": 2.594862222671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167485, + "balance_loss_mlp": 1.14840007, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.0960094219381758, + "language_loss": 1.09507632, + "learning_rate": 0.0008910768275115906, + "loss": 1.10675108, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.1907959, + "step": 90, + "time_per_iteration": 2.732243299484253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168948, + "balance_loss_mlp": 1.14970791, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.08670111946866453, + "language_loss": 1.05579484, + "learning_rate": 0.0008932649762767675, + "loss": 1.06748414, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19238281, + "step": 91, + "time_per_iteration": 2.58011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156938, + "balance_loss_mlp": 1.13799536, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.1377326340865385, + "language_loss": 1.07988524, + "learning_rate": 0.0008954292103690864, + "loss": 1.09145451, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.18933105, + "step": 92, + "time_per_iteration": 2.88777494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144865, + "balance_loss_mlp": 1.12581539, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.08013614344713903, + "language_loss": 1.10040021, + "learning_rate": 0.0008975700468778296, + "loss": 1.11184883, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19042969, + "step": 93, + "time_per_iteration": 2.5774590969085693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153192, + "balance_loss_mlp": 1.13429725, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.08120240816831911, + "language_loss": 1.03244281, + "learning_rate": 0.0008996879863005366, + "loss": 1.04397476, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.18896484, + "step": 94, + "time_per_iteration": 2.6684646606445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166139, + "balance_loss_mlp": 1.14685082, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.10696755240582503, + "language_loss": 1.0365541, + "learning_rate": 0.0009017835132453337, + "loss": 1.04821539, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19262695, + "step": 95, + "time_per_iteration": 2.5731871128082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160643, + "balance_loss_mlp": 1.14130712, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.09689172385373614, + "language_loss": 1.03809953, + "learning_rate": 0.0009038570970964896, + "loss": 1.04970598, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.1932373, + "step": 96, + "time_per_iteration": 2.7642133235931396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142174, + "balance_loss_mlp": 1.1226114, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.0731237284630876, + "language_loss": 1.01012015, + "learning_rate": 0.0009059091926454854, + "loss": 1.02154183, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.19543457, + "step": 97, + "time_per_iteration": 2.5798768997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134737, + "balance_loss_mlp": 1.11522222, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09616120207899966, + "language_loss": 1.00179553, + "learning_rate": 0.0009079402406897198, + "loss": 1.01314282, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19494629, + "step": 98, + "time_per_iteration": 3.2566075325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.12357211, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.06455780129345397, + "language_loss": 1.01265812, + "learning_rate": 0.0009099506686008212, + "loss": 1.02409148, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19763184, + "step": 99, + "time_per_iteration": 2.799565553665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129571, + "balance_loss_mlp": 1.11054564, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.10657448879387016, + "language_loss": 1.0467732, + "learning_rate": 0.0009119408908644013, + "loss": 1.05806899, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.19030762, + "step": 100, + "time_per_iteration": 2.684875249862671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122425, + "balance_loss_mlp": 1.10363734, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.06970738765852934, + "language_loss": 1.09725833, + "learning_rate": 0.0009139113095929519, + "loss": 1.1084826, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18762207, + "step": 101, + "time_per_iteration": 2.8530783653259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130466, + "balance_loss_mlp": 1.11095107, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.04951217111237057, + "language_loss": 1.03750157, + "learning_rate": 0.0009158623150134762, + "loss": 1.04880619, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19506836, + "step": 102, + "time_per_iteration": 2.5738718509674072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124372, + "balance_loss_mlp": 1.10552466, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.07829016079597523, + "language_loss": 1.03829539, + "learning_rate": 0.000917794285931332, + "loss": 1.04953909, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.18859863, + "step": 103, + "time_per_iteration": 2.6672050952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116034, + "balance_loss_mlp": 1.09756863, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06055754000551873, + "language_loss": 0.96430528, + "learning_rate": 0.0009197075901716639, + "loss": 0.97546566, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.18444824, + "step": 104, + "time_per_iteration": 2.7030909061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143399, + "balance_loss_mlp": 1.12458754, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.08851166873462187, + "language_loss": 1.06492853, + "learning_rate": 0.0009216025849997171, + "loss": 1.07636249, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.18798828, + "step": 105, + "time_per_iteration": 2.770717144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136641, + "balance_loss_mlp": 1.11799645, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.1087806769155691, + "language_loss": 1.01426148, + "learning_rate": 0.0009234796175212258, + "loss": 1.02562797, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.18640137, + "step": 106, + "time_per_iteration": 2.9345030784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145469, + "balance_loss_mlp": 1.12691963, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.08314221817588373, + "language_loss": 1.04264343, + "learning_rate": 0.000925339025064007, + "loss": 1.05409813, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.18530273, + "step": 107, + "time_per_iteration": 2.9724230766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136133, + "balance_loss_mlp": 1.11766744, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06103111074840472, + "language_loss": 0.9746207, + "learning_rate": 0.0009271811355418027, + "loss": 0.98598194, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.18457031, + "step": 108, + "time_per_iteration": 2.8312766551971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114122, + "balance_loss_mlp": 1.12251627, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.09366723049874563, + "language_loss": 1.0430491, + "learning_rate": 0.0009290062678013548, + "loss": 1.05446124, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.18713379, + "step": 109, + "time_per_iteration": 2.8890299797058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119997, + "balance_loss_mlp": 1.10091138, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.07845117671788823, + "language_loss": 1.02498507, + "learning_rate": 0.0009308147319536321, + "loss": 1.03618503, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19067383, + "step": 110, + "time_per_iteration": 2.6301145553588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124178, + "balance_loss_mlp": 1.10517561, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.06169483511964636, + "language_loss": 1.08628201, + "learning_rate": 0.0009326068296900676, + "loss": 1.09752393, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.18981934, + "step": 111, + "time_per_iteration": 2.8480148315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124091, + "balance_loss_mlp": 1.1046958, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.07277353768082521, + "language_loss": 1.00328588, + "learning_rate": 0.0009343828545846161, + "loss": 1.01452684, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19384766, + "step": 112, + "time_per_iteration": 2.785245656967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145687, + "balance_loss_mlp": 1.12596965, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.0989159829516975, + "language_loss": 1.03963184, + "learning_rate": 0.0009361430923823841, + "loss": 1.05108869, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.19702148, + "step": 113, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139838, + "balance_loss_mlp": 1.11994159, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.08134488401387123, + "language_loss": 1.07289195, + "learning_rate": 0.0009378878212755459, + "loss": 1.08429039, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.19885254, + "step": 114, + "time_per_iteration": 2.489394426345825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135841, + "balance_loss_mlp": 1.11546779, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.08931795851274972, + "language_loss": 0.98084462, + "learning_rate": 0.0009396173121672103, + "loss": 0.992203, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20373535, + "step": 115, + "time_per_iteration": 2.6338186264038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132046, + "balance_loss_mlp": 1.11229324, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.07784948028132394, + "language_loss": 1.03230667, + "learning_rate": 0.0009413318289238633, + "loss": 1.04362714, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.1973877, + "step": 116, + "time_per_iteration": 2.7797064781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119746, + "balance_loss_mlp": 1.10049319, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.10235619274826367, + "language_loss": 0.95674431, + "learning_rate": 0.0009430316286169771, + "loss": 0.96794176, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.19226074, + "step": 117, + "time_per_iteration": 3.0148251056671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123727, + "balance_loss_mlp": 1.10400951, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.08556933686221588, + "language_loss": 1.00759292, + "learning_rate": 0.0009447169617543361, + "loss": 1.0188303, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.19714355, + "step": 118, + "time_per_iteration": 2.570577383041382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147362, + "balance_loss_mlp": 1.12738276, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.14195532580527156, + "language_loss": 1.07468402, + "learning_rate": 0.0009463880725016029, + "loss": 1.08615768, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.19970703, + "step": 119, + "time_per_iteration": 2.687791585922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119491, + "balance_loss_mlp": 1.1002152, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.12580227983012474, + "language_loss": 1.02723956, + "learning_rate": 0.0009480451988946134, + "loss": 1.03843451, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19274902, + "step": 120, + "time_per_iteration": 2.86080002784729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.09974504, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.09779732210141849, + "language_loss": 1.04102588, + "learning_rate": 0.0009496885730428627, + "loss": 1.05221319, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1895752, + "step": 121, + "time_per_iteration": 3.058720350265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129608, + "balance_loss_mlp": 1.11076128, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.21300696817673925, + "language_loss": 1.02294064, + "learning_rate": 0.0009513184213246156, + "loss": 1.03423667, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.18859863, + "step": 122, + "time_per_iteration": 2.634585380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112851, + "balance_loss_mlp": 1.10879278, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.08876505507315528, + "language_loss": 1.05331969, + "learning_rate": 0.0009529349645740552, + "loss": 1.06460488, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.19702148, + "step": 123, + "time_per_iteration": 2.68062686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139736, + "balance_loss_mlp": 1.11948287, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.07165211399576038, + "language_loss": 1.04294729, + "learning_rate": 0.0009545384182608524, + "loss": 1.05434453, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.20239258, + "step": 124, + "time_per_iteration": 2.541867971420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147945, + "balance_loss_mlp": 1.12758446, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.1170262954091428, + "language_loss": 1.01733518, + "learning_rate": 0.0009561289926625252, + "loss": 1.02881455, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20361328, + "step": 125, + "time_per_iteration": 2.6904866695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144311, + "balance_loss_mlp": 1.12337756, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.0767802787123007, + "language_loss": 1.06512678, + "learning_rate": 0.0009577068930299292, + "loss": 1.07656991, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.20935059, + "step": 126, + "time_per_iteration": 2.5956666469573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112026, + "balance_loss_mlp": 1.10011339, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.05578094289714296, + "language_loss": 1.01563096, + "learning_rate": 0.0009592723197462087, + "loss": 1.02683353, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.20141602, + "step": 127, + "time_per_iteration": 2.652282953262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135813, + "balance_loss_mlp": 1.11633444, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.08941911012616197, + "language_loss": 0.98464531, + "learning_rate": 0.0009608254684795125, + "loss": 0.99600339, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.19470215, + "step": 128, + "time_per_iteration": 2.9219348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113214, + "balance_loss_mlp": 1.11204123, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.07851670709976168, + "language_loss": 1.01339173, + "learning_rate": 0.0009623665303297678, + "loss": 1.02471328, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.20092773, + "step": 129, + "time_per_iteration": 2.72129225730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138949, + "balance_loss_mlp": 1.11936343, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.10234054898828188, + "language_loss": 1.05215728, + "learning_rate": 0.0009638956919697878, + "loss": 1.0635469, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19580078, + "step": 130, + "time_per_iteration": 2.8943347930908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120076, + "balance_loss_mlp": 1.10040641, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07955649128739337, + "language_loss": 0.97532988, + "learning_rate": 0.0009654131357809714, + "loss": 0.98653066, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.19665527, + "step": 131, + "time_per_iteration": 2.5710790157318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131377, + "balance_loss_mlp": 1.11108756, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.09603534709419483, + "language_loss": 1.06830871, + "learning_rate": 0.0009669190399838441, + "loss": 1.07962251, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.20275879, + "step": 132, + "time_per_iteration": 3.12355899810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_mlp": 1.08422863, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.07678679730921736, + "language_loss": 0.99635059, + "learning_rate": 0.0009684135787636724, + "loss": 1.0073967, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.20373535, + "step": 133, + "time_per_iteration": 2.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011225, + "balance_loss_mlp": 1.10198379, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.06194161941979751, + "language_loss": 1.03999257, + "learning_rate": 0.0009698969223913726, + "loss": 1.05121756, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.2052002, + "step": 134, + "time_per_iteration": 3.0173001289367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111055, + "balance_loss_mlp": 1.09066617, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.06876216863310104, + "language_loss": 1.06792855, + "learning_rate": 0.0009713692373399265, + "loss": 1.07903397, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.19873047, + "step": 135, + "time_per_iteration": 2.670929431915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134721, + "balance_loss_mlp": 1.33280921, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.15411027982306336, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80803436, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.14355469, + "step": 136, + "time_per_iteration": 5.4502341747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142923, + "balance_loss_mlp": 1.13023889, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.0420308652143082, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.78953964, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.911421298980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140624, + "balance_loss_mlp": 1.1204778, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.15008184892874737, + "language_loss": 0.99414909, + "learning_rate": 0.0009757216201974225, + "loss": 1.00555539, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.20141602, + "step": 138, + "time_per_iteration": 2.805294990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163113, + "balance_loss_mlp": 1.1417979, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.10042691837700132, + "language_loss": 1.04683781, + "learning_rate": 0.0009771514130396581, + "loss": 1.05846894, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.21325684, + "step": 139, + "time_per_iteration": 2.6785237789154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171786, + "balance_loss_mlp": 1.15150893, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.13712828131438198, + "language_loss": 1.04777944, + "learning_rate": 0.00097857095638274, + "loss": 1.05949712, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20275879, + "step": 140, + "time_per_iteration": 2.5689632892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161751, + "balance_loss_mlp": 1.140818, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.04776427930188189, + "language_loss": 0.96152979, + "learning_rate": 0.0009799803961288726, + "loss": 0.97314727, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.20922852, + "step": 141, + "time_per_iteration": 3.005524158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114311, + "balance_loss_mlp": 1.12280869, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.08242063446041879, + "language_loss": 1.02058709, + "learning_rate": 0.000981379875086876, + "loss": 1.03201818, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.20300293, + "step": 142, + "time_per_iteration": 3.0404272079467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149559, + "balance_loss_mlp": 1.12884021, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08811908081945614, + "language_loss": 0.97007114, + "learning_rate": 0.0009827695330590185, + "loss": 0.98156673, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.20727539, + "step": 143, + "time_per_iteration": 2.677872896194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139202, + "balance_loss_mlp": 1.11838782, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09095558281985278, + "language_loss": 0.9660008, + "learning_rate": 0.0009841495069248256, + "loss": 0.97739279, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.20788574, + "step": 144, + "time_per_iteration": 3.0181970596313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124304, + "balance_loss_mlp": 1.10402668, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06968867614461936, + "language_loss": 0.96011639, + "learning_rate": 0.0009855199307219871, + "loss": 0.97135949, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.20275879, + "step": 145, + "time_per_iteration": 2.6638803482055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129462, + "balance_loss_mlp": 1.10819507, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.10380696742567494, + "language_loss": 0.97768301, + "learning_rate": 0.0009868809357244854, + "loss": 0.98897767, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.21264648, + "step": 146, + "time_per_iteration": 2.6609416007995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.08754969, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.04767435219925792, + "language_loss": 1.01976728, + "learning_rate": 0.0009882326505180556, + "loss": 1.03085351, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21081543, + "step": 147, + "time_per_iteration": 2.7018306255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116059, + "balance_loss_mlp": 1.09487534, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.081387986355653, + "language_loss": 1.0020777, + "learning_rate": 0.0009895752010730906, + "loss": 1.01323831, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21191406, + "step": 148, + "time_per_iteration": 2.9776458740234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114545, + "balance_loss_mlp": 1.09280121, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.07164111136345892, + "language_loss": 1.06547272, + "learning_rate": 0.0009909087108150867, + "loss": 1.07661819, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.21740723, + "step": 149, + "time_per_iteration": 2.7685787677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120998, + "balance_loss_mlp": 1.09932601, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.09002123643314056, + "language_loss": 1.07463562, + "learning_rate": 0.0009922333006927371, + "loss": 1.08584571, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.2166748, + "step": 150, + "time_per_iteration": 2.5377442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134752, + "balance_loss_mlp": 1.11268604, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07882603128859848, + "language_loss": 1.00827551, + "learning_rate": 0.0009935490892437632, + "loss": 1.01962304, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.22070312, + "step": 151, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126049, + "balance_loss_mlp": 1.10497248, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.07540534084758796, + "language_loss": 0.99210167, + "learning_rate": 0.0009948561926585687, + "loss": 1.00336218, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.21069336, + "step": 152, + "time_per_iteration": 2.755824565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133957, + "balance_loss_mlp": 1.1110214, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09890448438657973, + "language_loss": 1.02627087, + "learning_rate": 0.0009961547248418122, + "loss": 1.03761053, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.22937012, + "step": 153, + "time_per_iteration": 2.6255645751953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115876, + "balance_loss_mlp": 1.09208155, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.0750271830701194, + "language_loss": 0.99508584, + "learning_rate": 0.0009974447974719707, + "loss": 1.00624466, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.23791504, + "step": 154, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126502, + "balance_loss_mlp": 1.10213518, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.12681443605953674, + "language_loss": 1.01620197, + "learning_rate": 0.0009987265200589763, + "loss": 1.02746701, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.24365234, + "step": 155, + "time_per_iteration": 2.7264955043792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119337, + "balance_loss_mlp": 1.09590077, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.07965097154096117, + "language_loss": 1.01522899, + "learning_rate": 0.001, + "loss": 1.02642226, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.23400879, + "step": 156, + "time_per_iteration": 2.864698886871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111456, + "balance_loss_mlp": 1.09257805, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.061020534493473076, + "language_loss": 0.9859184, + "learning_rate": 0.0009999999029413921, + "loss": 0.99706399, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2199707, + "step": 157, + "time_per_iteration": 2.8241283893585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125679, + "balance_loss_mlp": 1.1049242, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.05862251807890935, + "language_loss": 1.00346851, + "learning_rate": 0.0009999996117656068, + "loss": 1.01472545, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.2076416, + "step": 158, + "time_per_iteration": 2.7097458839416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113624, + "balance_loss_mlp": 1.09279847, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.09545570145123992, + "language_loss": 0.93653512, + "learning_rate": 0.0009999991264727564, + "loss": 0.94767129, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.20837402, + "step": 159, + "time_per_iteration": 2.756363868713379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110635, + "balance_loss_mlp": 1.08577418, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.09475469160316574, + "language_loss": 1.04571712, + "learning_rate": 0.0009999984470630296, + "loss": 1.05678058, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.20581055, + "step": 160, + "time_per_iteration": 2.5990707874298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112086, + "balance_loss_mlp": 1.09061611, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.07420241291943742, + "language_loss": 0.9342289, + "learning_rate": 0.0009999975735366902, + "loss": 0.94534969, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.21472168, + "step": 161, + "time_per_iteration": 3.06878662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114184, + "balance_loss_mlp": 1.09270215, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.0799449593456649, + "language_loss": 0.95189524, + "learning_rate": 0.0009999965058940775, + "loss": 0.96303707, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.21484375, + "step": 162, + "time_per_iteration": 3.4937808513641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112457, + "balance_loss_mlp": 1.10226631, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.08293329451395655, + "language_loss": 1.01278222, + "learning_rate": 0.0009999952441356057, + "loss": 1.02402782, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.22314453, + "step": 163, + "time_per_iteration": 2.535121202468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109663, + "balance_loss_mlp": 1.08820534, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.06727245316799851, + "language_loss": 1.0154388, + "learning_rate": 0.000999993788261765, + "loss": 1.02653539, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.21472168, + "step": 164, + "time_per_iteration": 3.5832889080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_mlp": 1.08942175, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.07205404441274409, + "language_loss": 1.03110182, + "learning_rate": 0.00099999213827312, + "loss": 1.04221165, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.21569824, + "step": 165, + "time_per_iteration": 2.8096628189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118839, + "balance_loss_mlp": 1.09684491, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.050309165813849886, + "language_loss": 0.98088074, + "learning_rate": 0.000999990294170312, + "loss": 0.99206913, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.22009277, + "step": 166, + "time_per_iteration": 2.663135051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116813, + "balance_loss_mlp": 1.09486628, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06058681172545402, + "language_loss": 1.02190185, + "learning_rate": 0.0009999882559540566, + "loss": 1.03306985, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.21948242, + "step": 167, + "time_per_iteration": 2.649784564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118672, + "balance_loss_mlp": 1.09543872, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.10019647540930027, + "language_loss": 0.98887956, + "learning_rate": 0.000999986023625145, + "loss": 1.00006628, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23217773, + "step": 168, + "time_per_iteration": 2.6998720169067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01817799, + "balance_loss_mlp": 1.79767668, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.21411409700219255, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.80742216, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.20117188, + "step": 169, + "time_per_iteration": 5.029488563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.10157228, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.09130724925200479, + "language_loss": 0.99515283, + "learning_rate": 0.0009999809766328958, + "loss": 1.00641036, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.24206543, + "step": 170, + "time_per_iteration": 2.6508679389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153612, + "balance_loss_mlp": 1.12968671, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.0981725040523357, + "language_loss": 1.01766157, + "learning_rate": 0.0009999781619715177, + "loss": 1.02919769, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23925781, + "step": 171, + "time_per_iteration": 2.5449466705322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151071, + "balance_loss_mlp": 1.12767053, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.10018141203760955, + "language_loss": 1.0104121, + "learning_rate": 0.000999975153201402, + "loss": 1.02192283, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.23388672, + "step": 172, + "time_per_iteration": 2.8463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114097, + "balance_loss_mlp": 1.11745048, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.05920698759335099, + "language_loss": 0.98661143, + "learning_rate": 0.0009999719503237174, + "loss": 0.99802113, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.23498535, + "step": 173, + "time_per_iteration": 2.733147144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157549, + "balance_loss_mlp": 1.1333611, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.12686135486457134, + "language_loss": 1.07479167, + "learning_rate": 0.0009999685533397073, + "loss": 1.08636713, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.24194336, + "step": 174, + "time_per_iteration": 2.5705809593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110896, + "balance_loss_mlp": 1.08707762, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.07652801902249555, + "language_loss": 0.99758261, + "learning_rate": 0.00099996496225069, + "loss": 1.00869155, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.23815918, + "step": 175, + "time_per_iteration": 2.6572659015655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118018, + "balance_loss_mlp": 1.09399772, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.05463854096335067, + "language_loss": 1.01895058, + "learning_rate": 0.0009999611770580604, + "loss": 1.03013086, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.24023438, + "step": 176, + "time_per_iteration": 2.8216159343719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.09596181, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08810438351502946, + "language_loss": 1.01167393, + "learning_rate": 0.0009999571977632876, + "loss": 1.02288568, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.25231934, + "step": 177, + "time_per_iteration": 2.581037998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115073, + "balance_loss_mlp": 1.09040904, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.08419866181616258, + "language_loss": 1.03353202, + "learning_rate": 0.0009999530243679166, + "loss": 1.04468274, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.24682617, + "step": 178, + "time_per_iteration": 2.5844500064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137225, + "balance_loss_mlp": 1.11332321, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.13671082465577608, + "language_loss": 0.99045932, + "learning_rate": 0.0009999486568735675, + "loss": 1.00183165, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.23913574, + "step": 179, + "time_per_iteration": 3.044409990310669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125047, + "balance_loss_mlp": 1.1010983, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.0738854697341979, + "language_loss": 0.99422705, + "learning_rate": 0.0009999440952819362, + "loss": 1.00547755, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.23950195, + "step": 180, + "time_per_iteration": 3.644280433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_mlp": 1.08836114, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.04789131390967285, + "language_loss": 0.98983485, + "learning_rate": 0.0009999393395947935, + "loss": 1.00095737, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.2388916, + "step": 181, + "time_per_iteration": 2.8229053020477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114504, + "balance_loss_mlp": 1.08992302, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.08040661288612141, + "language_loss": 1.02358437, + "learning_rate": 0.0009999343898139858, + "loss": 1.03472936, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.24584961, + "step": 182, + "time_per_iteration": 2.6112709045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123737, + "balance_loss_mlp": 1.09824967, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.0879280890069936, + "language_loss": 1.01010704, + "learning_rate": 0.0009999292459414348, + "loss": 1.02134442, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.25476074, + "step": 183, + "time_per_iteration": 2.574800491333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111495, + "balance_loss_mlp": 1.08559036, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08068750200828848, + "language_loss": 1.05455053, + "learning_rate": 0.0009999239079791374, + "loss": 1.06566548, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.25915527, + "step": 184, + "time_per_iteration": 2.5650548934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110884, + "balance_loss_mlp": 1.08343673, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.07300059562366337, + "language_loss": 0.98493111, + "learning_rate": 0.0009999183759291659, + "loss": 0.99601954, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.25427246, + "step": 185, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110571, + "balance_loss_mlp": 1.08168936, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.09426698036311254, + "language_loss": 1.00536895, + "learning_rate": 0.0009999126497936682, + "loss": 1.01642609, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.24023438, + "step": 186, + "time_per_iteration": 2.5103538036346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110913, + "balance_loss_mlp": 1.08740544, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.07507023604654985, + "language_loss": 1.03590488, + "learning_rate": 0.0009999067295748676, + "loss": 1.047014, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.23510742, + "step": 187, + "time_per_iteration": 2.806403160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112247, + "balance_loss_mlp": 1.09995186, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.10679989437153373, + "language_loss": 1.00781608, + "learning_rate": 0.000999900615275062, + "loss": 1.01904082, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.22509766, + "step": 188, + "time_per_iteration": 2.6750597953796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.0823226, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.06425431277780277, + "language_loss": 1.06987619, + "learning_rate": 0.0009998943068966256, + "loss": 1.0809319, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.23242188, + "step": 189, + "time_per_iteration": 2.4297006130218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.0826813, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.07322572175010231, + "language_loss": 1.01591444, + "learning_rate": 0.0009998878044420072, + "loss": 1.02697778, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23669434, + "step": 190, + "time_per_iteration": 2.6686899662017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08489525, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.07088525550270033, + "language_loss": 0.97819, + "learning_rate": 0.0009998811079137318, + "loss": 0.98927271, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.23400879, + "step": 191, + "time_per_iteration": 2.5795974731445312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118931, + "balance_loss_mlp": 1.09439743, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.07437245365565072, + "language_loss": 0.9895249, + "learning_rate": 0.0009998742173143987, + "loss": 1.0007143, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.24536133, + "step": 192, + "time_per_iteration": 2.6109251976013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133748, + "balance_loss_mlp": 1.10824919, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.06698686336952825, + "language_loss": 0.98415262, + "learning_rate": 0.0009998671326466833, + "loss": 0.99549013, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.25524902, + "step": 193, + "time_per_iteration": 2.955780506134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136952, + "balance_loss_mlp": 1.10922432, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.07154145387165563, + "language_loss": 0.99267447, + "learning_rate": 0.0009998598539133362, + "loss": 1.00404394, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.27734375, + "step": 194, + "time_per_iteration": 3.0137686729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163557, + "balance_loss_mlp": 1.13373041, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.09795763902625766, + "language_loss": 1.00780571, + "learning_rate": 0.0009998523811171828, + "loss": 1.01944125, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.2980957, + "step": 195, + "time_per_iteration": 2.5090267658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164155, + "balance_loss_mlp": 1.13323212, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.0756543485462421, + "language_loss": 1.0036695, + "learning_rate": 0.0009998447142611248, + "loss": 1.015311, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.30883789, + "step": 196, + "time_per_iteration": 2.653759241104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12615836, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.10738469994654526, + "language_loss": 0.9438082, + "learning_rate": 0.0009998368533481387, + "loss": 0.95537138, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.30126953, + "step": 197, + "time_per_iteration": 3.03090763092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123277, + "balance_loss_mlp": 1.09433353, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.08947148055588174, + "language_loss": 0.97516447, + "learning_rate": 0.0009998287983812762, + "loss": 0.98639727, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.28930664, + "step": 198, + "time_per_iteration": 2.842519760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.10672641, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08719552456544254, + "language_loss": 1.03183711, + "learning_rate": 0.0009998205493636646, + "loss": 1.04316807, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.26416016, + "step": 199, + "time_per_iteration": 2.657094955444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099837, + "balance_loss_mlp": 1.07485092, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.11937452390124363, + "language_loss": 0.95869702, + "learning_rate": 0.0009998121062985063, + "loss": 0.96969533, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.24987793, + "step": 200, + "time_per_iteration": 2.6954355239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108694, + "balance_loss_mlp": 1.08444691, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.09459530753006626, + "language_loss": 0.98493665, + "learning_rate": 0.0009998034691890794, + "loss": 0.9960236, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.24243164, + "step": 201, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104411, + "balance_loss_mlp": 1.08075976, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07675440437740683, + "language_loss": 1.0290482, + "learning_rate": 0.0009997946380387369, + "loss": 1.04009235, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.23632812, + "step": 202, + "time_per_iteration": 2.63975191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111336, + "balance_loss_mlp": 1.08706474, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.09220046036918417, + "language_loss": 1.04956245, + "learning_rate": 0.0009997856128509076, + "loss": 1.06067586, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.24279785, + "step": 203, + "time_per_iteration": 2.856816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124883, + "balance_loss_mlp": 1.10112453, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.08622839045605694, + "language_loss": 0.99688643, + "learning_rate": 0.0009997763936290952, + "loss": 1.00813532, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.23754883, + "step": 204, + "time_per_iteration": 2.5392112731933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.10773039, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09842935942049862, + "language_loss": 1.0453217, + "learning_rate": 0.0009997669803768789, + "loss": 1.05664587, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24694824, + "step": 205, + "time_per_iteration": 2.7708992958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108465, + "balance_loss_mlp": 1.08426595, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.10843184908981528, + "language_loss": 0.9984858, + "learning_rate": 0.0009997573730979134, + "loss": 1.00957048, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.24194336, + "step": 206, + "time_per_iteration": 2.7474939823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01685643, + "balance_loss_mlp": 1.6616106, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.13014896830523812, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.80878842, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 0.24023438, + "step": 207, + "time_per_iteration": 4.682751655578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109669, + "balance_loss_mlp": 1.08474243, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.07677308889428856, + "language_loss": 0.98866731, + "learning_rate": 0.0009997375764747294, + "loss": 0.99976397, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.24926758, + "step": 208, + "time_per_iteration": 2.9866418838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110763, + "balance_loss_mlp": 1.08659935, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.07362493409063897, + "language_loss": 0.96845645, + "learning_rate": 0.0009997273871381967, + "loss": 0.97956407, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.24169922, + "step": 209, + "time_per_iteration": 2.7354848384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125333, + "balance_loss_mlp": 1.09998906, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07873798613461079, + "language_loss": 1.01664305, + "learning_rate": 0.0009997170037902862, + "loss": 1.0278964, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.25366211, + "step": 210, + "time_per_iteration": 2.704061269760132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_mlp": 1.09462297, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.06515356853390573, + "language_loss": 1.04550838, + "learning_rate": 0.0009997064264350292, + "loss": 1.05671442, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.26013184, + "step": 211, + "time_per_iteration": 2.8975577354431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113542, + "balance_loss_mlp": 1.08662462, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07652094351016743, + "language_loss": 0.98263478, + "learning_rate": 0.0009996956550765317, + "loss": 0.99377024, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.26928711, + "step": 212, + "time_per_iteration": 2.6716954708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125588, + "balance_loss_mlp": 1.09752572, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07289633346919515, + "language_loss": 0.93075061, + "learning_rate": 0.0009996846897189762, + "loss": 0.94200653, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.28051758, + "step": 213, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110085, + "balance_loss_mlp": 1.08412087, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.055838089119108855, + "language_loss": 0.99370623, + "learning_rate": 0.0009996735303666193, + "loss": 1.004807, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.2598877, + "step": 214, + "time_per_iteration": 2.6928601264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095032, + "balance_loss_mlp": 1.06966448, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.04962656356162825, + "language_loss": 1.01034558, + "learning_rate": 0.0009996621770237937, + "loss": 1.02129602, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.25390625, + "step": 215, + "time_per_iteration": 2.760256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098352, + "balance_loss_mlp": 1.07167339, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.06820201547086252, + "language_loss": 0.97216904, + "learning_rate": 0.0009996506296949073, + "loss": 0.98315251, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.26708984, + "step": 216, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106582, + "balance_loss_mlp": 1.0792954, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.05678696526689756, + "language_loss": 0.96681535, + "learning_rate": 0.0009996388883844428, + "loss": 0.97788119, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27282715, + "step": 217, + "time_per_iteration": 2.6392288208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092837, + "balance_loss_mlp": 1.06704009, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.06325985488704432, + "language_loss": 1.01514912, + "learning_rate": 0.0009996269530969588, + "loss": 1.02607751, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.25830078, + "step": 218, + "time_per_iteration": 2.6588566303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105308, + "balance_loss_mlp": 1.08038127, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.07879458740668356, + "language_loss": 0.99769139, + "learning_rate": 0.0009996148238370888, + "loss": 1.00874448, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.24938965, + "step": 219, + "time_per_iteration": 2.7322278022766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103161, + "balance_loss_mlp": 1.07711363, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.0629407592127239, + "language_loss": 0.95434463, + "learning_rate": 0.0009996025006095421, + "loss": 0.96537632, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.26049805, + "step": 220, + "time_per_iteration": 3.336355209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02460831, + "balance_loss_mlp": 2.43965983, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.4526401201513886, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.80243975, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 0.21191406, + "step": 221, + "time_per_iteration": 5.584397315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138695, + "balance_loss_mlp": 1.11146736, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.08000509590360377, + "language_loss": 0.96767551, + "learning_rate": 0.0009995772722706307, + "loss": 0.9790625, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.27246094, + "step": 222, + "time_per_iteration": 2.932035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177646, + "balance_loss_mlp": 1.14898777, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.06295735346771135, + "language_loss": 1.10290885, + "learning_rate": 0.0009995643671690604, + "loss": 1.1146853, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.28686523, + "step": 223, + "time_per_iteration": 2.489574909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118606, + "balance_loss_mlp": 1.15768862, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.06397701682602697, + "language_loss": 0.97599596, + "learning_rate": 0.0009995512681194023, + "loss": 0.98785651, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.28369141, + "step": 224, + "time_per_iteration": 2.8617055416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204697, + "balance_loss_mlp": 1.17644429, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.0569906191636753, + "language_loss": 0.95713508, + "learning_rate": 0.0009995379751267417, + "loss": 0.96918201, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28295898, + "step": 225, + "time_per_iteration": 3.272956371307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211045, + "balance_loss_mlp": 1.17959809, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.06210348551978246, + "language_loss": 0.970909, + "learning_rate": 0.0009995244881962398, + "loss": 0.98301941, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.31420898, + "step": 226, + "time_per_iteration": 2.629014253616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207143, + "balance_loss_mlp": 1.17750776, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.06412842399528458, + "language_loss": 0.97423029, + "learning_rate": 0.0009995108073331323, + "loss": 0.98630178, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.29614258, + "step": 227, + "time_per_iteration": 2.598266124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209228, + "balance_loss_mlp": 1.1790204, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05900157234221112, + "language_loss": 1.00919747, + "learning_rate": 0.0009994969325427309, + "loss": 1.02128983, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.30200195, + "step": 228, + "time_per_iteration": 2.681445598602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208149, + "balance_loss_mlp": 1.17727375, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.08372721248844238, + "language_loss": 0.96768719, + "learning_rate": 0.0009994828638304218, + "loss": 0.97976863, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.30883789, + "step": 229, + "time_per_iteration": 2.6330137252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213023, + "balance_loss_mlp": 1.18202829, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.09332052147555223, + "language_loss": 1.02555704, + "learning_rate": 0.0009994686012016675, + "loss": 1.0376873, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.30981445, + "step": 230, + "time_per_iteration": 2.519575595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205079, + "balance_loss_mlp": 1.17470419, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.07303811655625075, + "language_loss": 1.02279592, + "learning_rate": 0.000999454144662005, + "loss": 1.03484678, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.3034668, + "step": 231, + "time_per_iteration": 2.8772194385528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200788, + "balance_loss_mlp": 1.16729009, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.05982585511102693, + "language_loss": 0.9550131, + "learning_rate": 0.0009994394942170468, + "loss": 0.96702093, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.33520508, + "step": 232, + "time_per_iteration": 2.705536127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200355, + "balance_loss_mlp": 1.16673827, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.06482734437318205, + "language_loss": 0.93872058, + "learning_rate": 0.0009994246498724808, + "loss": 0.95072412, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33642578, + "step": 233, + "time_per_iteration": 2.729526996612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204357, + "balance_loss_mlp": 1.17043054, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06840473363398163, + "language_loss": 0.96267349, + "learning_rate": 0.00099940961163407, + "loss": 0.97471702, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33935547, + "step": 234, + "time_per_iteration": 2.8506321907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210646, + "balance_loss_mlp": 1.1758604, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061734633326469966, + "language_loss": 0.99016106, + "learning_rate": 0.0009993943795076528, + "loss": 1.0022676, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.34814453, + "step": 235, + "time_per_iteration": 2.6817193031311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012082, + "balance_loss_mlp": 1.17379582, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07722659013027651, + "language_loss": 1.01211047, + "learning_rate": 0.0009993789534991427, + "loss": 1.02419257, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34423828, + "step": 236, + "time_per_iteration": 2.4797797203063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216471, + "balance_loss_mlp": 1.18354487, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.057771959372629855, + "language_loss": 0.96296465, + "learning_rate": 0.0009993633336145287, + "loss": 0.97512937, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.3293457, + "step": 237, + "time_per_iteration": 2.629390001296997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225643, + "balance_loss_mlp": 1.19369495, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.07668042159358972, + "language_loss": 1.00654197, + "learning_rate": 0.0009993475198598752, + "loss": 1.01879823, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31958008, + "step": 238, + "time_per_iteration": 3.01481032371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220207, + "balance_loss_mlp": 1.1866858, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.08994725037560618, + "language_loss": 0.96828419, + "learning_rate": 0.0009993315122413212, + "loss": 0.98048627, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33544922, + "step": 239, + "time_per_iteration": 2.6483867168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215592, + "balance_loss_mlp": 1.18042517, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.08238446857980607, + "language_loss": 0.9678297, + "learning_rate": 0.0009993153107650818, + "loss": 0.97998565, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.35180664, + "step": 240, + "time_per_iteration": 2.594534158706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199347, + "balance_loss_mlp": 1.16303563, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09316981102360596, + "language_loss": 0.96465278, + "learning_rate": 0.0009992989154374468, + "loss": 0.9766463, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.36328125, + "step": 241, + "time_per_iteration": 2.5503900051116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190623, + "balance_loss_mlp": 1.15631413, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.06540072726643342, + "language_loss": 1.03219867, + "learning_rate": 0.0009992823262647817, + "loss": 1.04410505, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.34301758, + "step": 242, + "time_per_iteration": 2.7218894958496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156597, + "balance_loss_mlp": 1.1235044, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.09177405734811558, + "language_loss": 0.97326249, + "learning_rate": 0.0009992655432535264, + "loss": 0.98482847, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.33105469, + "step": 243, + "time_per_iteration": 2.800133466720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136682, + "balance_loss_mlp": 1.10614085, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.0753000751829641, + "language_loss": 0.98140877, + "learning_rate": 0.0009992485664101973, + "loss": 0.99277562, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.30517578, + "step": 244, + "time_per_iteration": 2.6863763332366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.08648348, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.06369495608278983, + "language_loss": 1.00049853, + "learning_rate": 0.000999231395741385, + "loss": 1.01165819, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.29467773, + "step": 245, + "time_per_iteration": 3.145612955093384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104415, + "balance_loss_mlp": 1.0764488, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.058358007346171054, + "language_loss": 0.97651666, + "learning_rate": 0.0009992140312537557, + "loss": 0.98756075, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.2800293, + "step": 246, + "time_per_iteration": 2.612847328186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092763, + "balance_loss_mlp": 1.06641817, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.0813165094086701, + "language_loss": 0.93562448, + "learning_rate": 0.000999196472954051, + "loss": 0.94655204, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.26379395, + "step": 247, + "time_per_iteration": 2.9633545875549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02706023, + "balance_loss_mlp": 2.55038333, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.26644214904670055, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.82130873, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 1.5546875, + "step": 248, + "time_per_iteration": 5.665804624557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.12381256, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.07780849766073628, + "language_loss": 1.00670481, + "learning_rate": 0.0009991607749457578, + "loss": 1.01821971, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.27709961, + "step": 249, + "time_per_iteration": 2.511357069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173992, + "balance_loss_mlp": 1.14483345, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.08242230719461915, + "language_loss": 0.98555326, + "learning_rate": 0.0009991426352510286, + "loss": 0.99729323, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.29174805, + "step": 250, + "time_per_iteration": 2.9747626781463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213643, + "balance_loss_mlp": 1.18186164, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.08110439009499554, + "language_loss": 0.99640858, + "learning_rate": 0.0009991243017719422, + "loss": 1.00854492, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.31787109, + "step": 251, + "time_per_iteration": 2.6450002193450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247147, + "balance_loss_mlp": 1.21276748, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09531666026222298, + "language_loss": 0.94547766, + "learning_rate": 0.0009991057745156165, + "loss": 0.95794916, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34375, + "step": 252, + "time_per_iteration": 2.608226776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0212821, + "balance_loss_mlp": 2.05687547, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.23568337742673945, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84039193, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.71484375, + "step": 253, + "time_per_iteration": 5.009166955947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253718, + "balance_loss_mlp": 1.22112656, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.11732554794190522, + "language_loss": 1.02719152, + "learning_rate": 0.0009990681387000943, + "loss": 1.03972876, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.32568359, + "step": 254, + "time_per_iteration": 2.733544111251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259536, + "balance_loss_mlp": 1.22959042, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.10757948615664437, + "language_loss": 0.99075437, + "learning_rate": 0.0009990490301555093, + "loss": 1.00334978, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.29907227, + "step": 255, + "time_per_iteration": 2.952223777770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01833791, + "balance_loss_mlp": 1.79201972, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.13001926806611183, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81048942, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.41796875, + "step": 256, + "time_per_iteration": 4.834028244018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01839647, + "balance_loss_mlp": 1.7994014, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.11989001468728706, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81082386, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.40234375, + "step": 257, + "time_per_iteration": 4.963416814804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01764173, + "balance_loss_mlp": 1.72659838, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.09913369297847359, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71740055, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.375, + "step": 258, + "time_per_iteration": 4.860485076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242536, + "balance_loss_mlp": 1.21342516, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.09740558448014502, + "language_loss": 0.93272007, + "learning_rate": 0.0009989706585723202, + "loss": 0.94514549, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29101562, + "step": 259, + "time_per_iteration": 2.763617753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252163, + "balance_loss_mlp": 1.22202659, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.1249592106702951, + "language_loss": 0.99313855, + "learning_rate": 0.0009989505813633442, + "loss": 1.0056603, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.30102539, + "step": 260, + "time_per_iteration": 2.687018394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240716, + "balance_loss_mlp": 1.2099601, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.12109163963871895, + "language_loss": 0.99271172, + "learning_rate": 0.000998930310444573, + "loss": 1.00511885, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.30712891, + "step": 261, + "time_per_iteration": 2.7355992794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194626, + "balance_loss_mlp": 1.16220057, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.10196827835843725, + "language_loss": 0.96712077, + "learning_rate": 0.0009989098458238765, + "loss": 0.97906703, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.32421875, + "step": 262, + "time_per_iteration": 2.8160154819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120265, + "balance_loss_mlp": 1.16850853, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.08050125519090791, + "language_loss": 0.96376812, + "learning_rate": 0.0009988891875091998, + "loss": 0.97579467, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.34179688, + "step": 263, + "time_per_iteration": 2.7738425731658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221172, + "balance_loss_mlp": 1.18657792, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.09840792148235085, + "language_loss": 0.91716301, + "learning_rate": 0.0009988683355085636, + "loss": 0.92937469, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.34619141, + "step": 264, + "time_per_iteration": 2.7763147354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240941, + "balance_loss_mlp": 1.20393836, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.10851467261948886, + "language_loss": 0.99809039, + "learning_rate": 0.000998847289830063, + "loss": 1.01049972, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.37011719, + "step": 265, + "time_per_iteration": 2.824655532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228337, + "balance_loss_mlp": 1.1930747, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.10300549526892724, + "language_loss": 0.92410266, + "learning_rate": 0.0009988260504818682, + "loss": 0.93638599, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.35253906, + "step": 266, + "time_per_iteration": 2.5484864711761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187227, + "balance_loss_mlp": 1.15127397, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.08304900792028935, + "language_loss": 0.99349552, + "learning_rate": 0.000998804617472226, + "loss": 1.00536776, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.35986328, + "step": 267, + "time_per_iteration": 2.67124342918396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.1241138, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.09977621520267708, + "language_loss": 0.94207335, + "learning_rate": 0.0009987829908094568, + "loss": 0.95364869, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.33447266, + "step": 268, + "time_per_iteration": 2.813934087753296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.09908843, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.11738978381138881, + "language_loss": 1.00792646, + "learning_rate": 0.0009987611705019569, + "loss": 1.01926744, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.3503418, + "step": 269, + "time_per_iteration": 4.138862133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117111, + "balance_loss_mlp": 1.08282614, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.05348082980263852, + "language_loss": 0.99369657, + "learning_rate": 0.0009987391565581978, + "loss": 1.00486767, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.34277344, + "step": 270, + "time_per_iteration": 2.586071014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126117, + "balance_loss_mlp": 1.09176075, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.07524916084480812, + "language_loss": 0.92056942, + "learning_rate": 0.000998716948986726, + "loss": 0.93183053, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.34350586, + "step": 271, + "time_per_iteration": 2.7993569374084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142479, + "balance_loss_mlp": 1.10948217, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0817059207133684, + "language_loss": 0.94050443, + "learning_rate": 0.0009986945477961633, + "loss": 0.95192927, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.33032227, + "step": 272, + "time_per_iteration": 2.692488193511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162369, + "balance_loss_mlp": 1.13108802, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.07154102990319093, + "language_loss": 0.9958387, + "learning_rate": 0.0009986719529952066, + "loss": 1.00746238, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.3125, + "step": 273, + "time_per_iteration": 2.834634780883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151893, + "balance_loss_mlp": 1.12099373, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.11641144040169231, + "language_loss": 0.98596179, + "learning_rate": 0.000998649164592628, + "loss": 0.99748075, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.30859375, + "step": 274, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128229, + "balance_loss_mlp": 1.0986656, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.08444223005841496, + "language_loss": 0.96863008, + "learning_rate": 0.0009986261825972748, + "loss": 0.97991234, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29541016, + "step": 275, + "time_per_iteration": 2.66398549079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116765, + "balance_loss_mlp": 1.08734369, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09541227165854013, + "language_loss": 0.9859423, + "learning_rate": 0.000998603007018069, + "loss": 0.99711001, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29394531, + "step": 276, + "time_per_iteration": 2.7675342559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108591, + "balance_loss_mlp": 1.07731009, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06559506468622318, + "language_loss": 0.95903766, + "learning_rate": 0.0009985796378640089, + "loss": 0.97012359, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.3125, + "step": 277, + "time_per_iteration": 2.7019519805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111687, + "balance_loss_mlp": 1.08012068, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07318038514420845, + "language_loss": 0.95983016, + "learning_rate": 0.0009985560751441665, + "loss": 0.97094703, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.31542969, + "step": 278, + "time_per_iteration": 2.8234922885894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111914, + "balance_loss_mlp": 1.0874306, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.07220087085065136, + "language_loss": 0.98319995, + "learning_rate": 0.00099853231886769, + "loss": 0.99439132, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.31713867, + "step": 279, + "time_per_iteration": 2.7748613357543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133292, + "balance_loss_mlp": 1.10162961, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06439402113592181, + "language_loss": 0.98657203, + "learning_rate": 0.0009985083690438024, + "loss": 0.99790496, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.31640625, + "step": 280, + "time_per_iteration": 2.700810670852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132869, + "balance_loss_mlp": 1.10204113, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.04843472954862069, + "language_loss": 0.89283121, + "learning_rate": 0.0009984842256818016, + "loss": 0.9041599, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.30786133, + "step": 281, + "time_per_iteration": 3.115292549133301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113546, + "balance_loss_mlp": 1.10580087, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.06657413960403659, + "language_loss": 0.99515754, + "learning_rate": 0.0009984598887910613, + "loss": 1.00651217, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.29614258, + "step": 282, + "time_per_iteration": 2.735640048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_mlp": 1.10893846, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.07881571737542031, + "language_loss": 0.95306879, + "learning_rate": 0.0009984353583810297, + "loss": 0.96447432, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.31616211, + "step": 283, + "time_per_iteration": 2.8240931034088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128602, + "balance_loss_mlp": 1.09834647, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.0943213260733239, + "language_loss": 0.97471213, + "learning_rate": 0.0009984106344612302, + "loss": 0.98599815, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.30224609, + "step": 284, + "time_per_iteration": 2.802689790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119254, + "balance_loss_mlp": 1.08964229, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.0726777825280204, + "language_loss": 0.92919928, + "learning_rate": 0.0009983857170412615, + "loss": 0.94039178, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.29589844, + "step": 285, + "time_per_iteration": 3.0111782550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.10165143, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.06957121076923053, + "language_loss": 0.92976809, + "learning_rate": 0.000998360606130798, + "loss": 0.94110835, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.32324219, + "step": 286, + "time_per_iteration": 2.8221306800842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01949249, + "balance_loss_mlp": 1.90461755, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.20138197735421756, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71022367, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.44726562, + "step": 287, + "time_per_iteration": 4.872509956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160615, + "balance_loss_mlp": 1.12447047, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.09083797153449202, + "language_loss": 0.98382282, + "learning_rate": 0.0009983098038774552, + "loss": 0.99542892, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.36132812, + "step": 288, + "time_per_iteration": 2.7861900329589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0156365, + "balance_loss_mlp": 1.54524422, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.05039988105800305, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.79733872, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.18359375, + "step": 289, + "time_per_iteration": 4.809176683425903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183294, + "balance_loss_mlp": 1.14958155, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.11767359006900376, + "language_loss": 0.95852768, + "learning_rate": 0.0009982582277800948, + "loss": 0.9703607, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.33666992, + "step": 290, + "time_per_iteration": 2.5785539150238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.11738336, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09005932528563108, + "language_loss": 1.03039932, + "learning_rate": 0.0009982321495648908, + "loss": 1.04188573, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.3125, + "step": 291, + "time_per_iteration": 2.798412561416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133747, + "balance_loss_mlp": 1.10218096, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.07041326246084649, + "language_loss": 0.9488259, + "learning_rate": 0.0009982058779188115, + "loss": 0.96016335, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.31542969, + "step": 292, + "time_per_iteration": 2.7117443084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113109, + "balance_loss_mlp": 1.08354521, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.0659469171672323, + "language_loss": 1.02221513, + "learning_rate": 0.0009981794128520567, + "loss": 1.0333463, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.29589844, + "step": 293, + "time_per_iteration": 2.83561372756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_mlp": 1.10104227, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.07618014203826041, + "language_loss": 0.98908657, + "learning_rate": 0.000998152754374901, + "loss": 1.00039291, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.2956543, + "step": 294, + "time_per_iteration": 2.879502773284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133819, + "balance_loss_mlp": 1.1052562, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.09109925372268521, + "language_loss": 0.94850433, + "learning_rate": 0.0009981259024976943, + "loss": 0.95984244, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.28564453, + "step": 295, + "time_per_iteration": 2.708038568496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129466, + "balance_loss_mlp": 1.10023606, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.08548016831625774, + "language_loss": 0.92669952, + "learning_rate": 0.0009980988572308612, + "loss": 0.93799424, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.29248047, + "step": 296, + "time_per_iteration": 2.99466609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126952, + "balance_loss_mlp": 1.09779358, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.05751010220277151, + "language_loss": 0.96034563, + "learning_rate": 0.0009980716185849015, + "loss": 0.9716152, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.29174805, + "step": 297, + "time_per_iteration": 3.0216734409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135652, + "balance_loss_mlp": 1.10651755, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06310788330802251, + "language_loss": 0.92855394, + "learning_rate": 0.0009980441865703904, + "loss": 0.93991041, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29150391, + "step": 298, + "time_per_iteration": 2.6354267597198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124221, + "balance_loss_mlp": 1.09456158, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07879622532675779, + "language_loss": 1.0091691, + "learning_rate": 0.000998016561197978, + "loss": 1.02041125, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29638672, + "step": 299, + "time_per_iteration": 2.726853370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104202, + "balance_loss_mlp": 1.0768075, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.07606317837722033, + "language_loss": 0.9243238, + "learning_rate": 0.0009979887424783895, + "loss": 0.9353658, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.27441406, + "step": 300, + "time_per_iteration": 2.866880416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03286275, + "balance_loss_mlp": 5.97428513, + "diversity_loss_mlp": 0.40086228, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.08630620995418306, + "language_loss": 1.00780904, + "learning_rate": 0.0009979607304224248, + "loss": 1.04067183, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.09870158, + "step": 301, + "time_per_iteration": 2.8737847805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101287, + "balance_loss_mlp": 1.07100797, + "diversity_loss_mlp": 0.0, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07465341521099292, + "language_loss": 0.98771101, + "learning_rate": 0.000997932525040959, + "loss": 0.99872386, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.30273438, + "routerloss_mlp": 0.0, + "step": 302, + "time_per_iteration": 2.646038055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097684, + "balance_loss_mlp": 1.06912112, + "diversity_loss_mlp": 0.0, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.0784548088046029, + "language_loss": 1.01345074, + "learning_rate": 0.000997904126344943, + "loss": 1.02442753, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.28527832, + "routerloss_mlp": 0.0, + "step": 303, + "time_per_iteration": 2.607773542404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117476, + "balance_loss_mlp": 1.08612442, + "diversity_loss_mlp": 0.0, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.08413175271133923, + "language_loss": 0.96722186, + "learning_rate": 0.0009978755343454018, + "loss": 0.97839665, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.31323242, + "routerloss_mlp": 0.0, + "step": 304, + "time_per_iteration": 2.7423698902130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.11099684, + "diversity_loss_mlp": 0.0, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.08591892096672729, + "language_loss": 0.97475642, + "learning_rate": 0.0009978467490534355, + "loss": 0.98621881, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.35229492, + "routerloss_mlp": 0.0, + "step": 305, + "time_per_iteration": 2.5751075744628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144387, + "balance_loss_mlp": 1.10974526, + "diversity_loss_mlp": 0.0, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06674928608125212, + "language_loss": 0.95161211, + "learning_rate": 0.00099781777048022, + "loss": 0.96305597, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.34667969, + "routerloss_mlp": 0.0, + "step": 306, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142445, + "balance_loss_mlp": 1.10766006, + "diversity_loss_mlp": 0.0, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.08714127978238019, + "language_loss": 0.96547389, + "learning_rate": 0.0009977885986370057, + "loss": 0.97689843, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.34790039, + "routerloss_mlp": 0.0, + "step": 307, + "time_per_iteration": 2.555311679840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114348, + "balance_loss_mlp": 1.11098385, + "diversity_loss_mlp": 0.0, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07630797692789458, + "language_loss": 0.93133295, + "learning_rate": 0.000997759233535118, + "loss": 0.94276774, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.32495117, + "routerloss_mlp": 0.0, + "step": 308, + "time_per_iteration": 2.7760326862335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137991, + "balance_loss_mlp": 1.10530353, + "diversity_loss_mlp": 0.0, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.1535726459245726, + "language_loss": 0.98530197, + "learning_rate": 0.0009977296751859576, + "loss": 0.99668187, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.32666016, + "routerloss_mlp": 0.0, + "step": 309, + "time_per_iteration": 2.7718236446380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119868, + "balance_loss_mlp": 1.09030402, + "diversity_loss_mlp": 0.0, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.09363029892750833, + "language_loss": 1.00139546, + "learning_rate": 0.0009976999236009998, + "loss": 1.01259422, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 310, + "time_per_iteration": 2.7480924129486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128418, + "balance_loss_mlp": 1.1004039, + "diversity_loss_mlp": 0.0, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.11799476734746514, + "language_loss": 1.01830125, + "learning_rate": 0.0009976699787917955, + "loss": 1.02958548, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.28051758, + "routerloss_mlp": 0.0, + "step": 311, + "time_per_iteration": 2.6702628135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02237821, + "balance_loss_mlp": 2.22513723, + "diversity_loss_mlp": 0.0, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.1521885653041848, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75680816, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 312, + "time_per_iteration": 4.968472480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01934551, + "balance_loss_mlp": 3.38140035, + "diversity_loss_mlp": 0.39575127, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.05936914788699087, + "language_loss": 0.983639, + "learning_rate": 0.0009976095095472243, + "loss": 1.00298452, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.04597524, + "step": 313, + "time_per_iteration": 2.6077775955200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140935, + "balance_loss_mlp": 1.11120427, + "diversity_loss_mlp": 0.0, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.09323488343042824, + "language_loss": 0.95392269, + "learning_rate": 0.0009975789851353334, + "loss": 0.96533203, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29736328, + "routerloss_mlp": 0.0, + "step": 314, + "time_per_iteration": 2.810530424118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152798, + "balance_loss_mlp": 1.12359178, + "diversity_loss_mlp": 0.0, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.09115128879339694, + "language_loss": 0.97407585, + "learning_rate": 0.0009975482675461487, + "loss": 0.98560387, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.29223633, + "routerloss_mlp": 0.0, + "step": 315, + "time_per_iteration": 2.658961772918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165231, + "balance_loss_mlp": 1.13464189, + "diversity_loss_mlp": 0.0, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.08232329918432242, + "language_loss": 0.95008749, + "learning_rate": 0.0009975173567915952, + "loss": 0.96173978, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.3059082, + "routerloss_mlp": 0.0, + "step": 316, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208938, + "balance_loss_mlp": 1.17508304, + "diversity_loss_mlp": 0.0, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.11734128354988786, + "language_loss": 0.89037865, + "learning_rate": 0.000997486252883674, + "loss": 0.90246803, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.33886719, + "routerloss_mlp": 0.0, + "step": 317, + "time_per_iteration": 2.82440447807312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246386, + "balance_loss_mlp": 1.21069503, + "diversity_loss_mlp": 0.0, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.09191065951965113, + "language_loss": 0.94435382, + "learning_rate": 0.0009974549558344602, + "loss": 0.95681769, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.35693359, + "routerloss_mlp": 0.0, + "step": 318, + "time_per_iteration": 3.6594014167785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256455, + "balance_loss_mlp": 1.22028661, + "diversity_loss_mlp": 0.0, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.10186826507715854, + "language_loss": 1.03254342, + "learning_rate": 0.000997423465656105, + "loss": 1.04510808, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.36181641, + "routerloss_mlp": 0.0, + "step": 319, + "time_per_iteration": 2.7277376651763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228783, + "balance_loss_mlp": 1.19342566, + "diversity_loss_mlp": 0.0, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.07892523617459922, + "language_loss": 1.00628281, + "learning_rate": 0.0009973917823608335, + "loss": 1.01857066, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.35375977, + "routerloss_mlp": 0.0, + "step": 320, + "time_per_iteration": 2.608973503112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216411, + "balance_loss_mlp": 1.18279386, + "diversity_loss_mlp": 0.0, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.08046246772740448, + "language_loss": 0.96186835, + "learning_rate": 0.0009973599059609462, + "loss": 0.9740324, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 321, + "time_per_iteration": 2.736543655395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188588, + "balance_loss_mlp": 1.15735531, + "diversity_loss_mlp": 0.0, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.06958940991484033, + "language_loss": 0.93877137, + "learning_rate": 0.000997327836468819, + "loss": 0.95065725, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.31225586, + "routerloss_mlp": 0.0, + "step": 322, + "time_per_iteration": 2.6034624576568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.14392066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.10097410409674823, + "language_loss": 0.96476239, + "learning_rate": 0.000997295573896902, + "loss": 0.97648811, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28649902, + "routerloss_mlp": 0.0, + "step": 323, + "time_per_iteration": 2.8207039833068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02388506, + "balance_loss_mlp": 2.37343788, + "diversity_loss_mlp": 0.0, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.2858946964689234, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83584547, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 324, + "time_per_iteration": 4.691263437271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01793915, + "balance_loss_mlp": 1.78142214, + "diversity_loss_mlp": 0.0, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.11944332826526777, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80365855, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 325, + "time_per_iteration": 4.837715148925781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214832, + "balance_loss_mlp": 1.18657923, + "diversity_loss_mlp": 0.0, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.0814388529334085, + "language_loss": 0.91516924, + "learning_rate": 0.000997197627828043, + "loss": 0.92731762, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 326, + "time_per_iteration": 2.5261096954345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228602, + "balance_loss_mlp": 1.20018268, + "diversity_loss_mlp": 0.0, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.08774897428196327, + "language_loss": 0.86495018, + "learning_rate": 0.0009971645930629716, + "loss": 0.87723619, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.28442383, + "routerloss_mlp": 0.0, + "step": 327, + "time_per_iteration": 2.73193621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236303, + "balance_loss_mlp": 1.20914674, + "diversity_loss_mlp": 0.0, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0823367638378532, + "language_loss": 0.99889791, + "learning_rate": 0.0009971313652814872, + "loss": 1.01126099, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.2722168, + "routerloss_mlp": 0.0, + "step": 328, + "time_per_iteration": 2.79278826713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224995, + "balance_loss_mlp": 1.1973865, + "diversity_loss_mlp": 0.0, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.1407341288256049, + "language_loss": 0.97435188, + "learning_rate": 0.0009970979444964903, + "loss": 0.98660183, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27636719, + "routerloss_mlp": 0.0, + "step": 329, + "time_per_iteration": 2.9955334663391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213807, + "balance_loss_mlp": 1.18553066, + "diversity_loss_mlp": 0.0, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.10291010686297611, + "language_loss": 0.9869082, + "learning_rate": 0.0009970643307209556, + "loss": 0.99904621, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 330, + "time_per_iteration": 2.79775071144104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202809, + "balance_loss_mlp": 1.17248201, + "diversity_loss_mlp": 0.0, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08231148280507655, + "language_loss": 0.94842714, + "learning_rate": 0.0009970305239679334, + "loss": 0.96045524, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.30322266, + "routerloss_mlp": 0.0, + "step": 331, + "time_per_iteration": 2.802400827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203401, + "balance_loss_mlp": 1.17300248, + "diversity_loss_mlp": 0.0, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.08804880344809486, + "language_loss": 0.99692816, + "learning_rate": 0.0009969965242505483, + "loss": 1.00896215, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.30371094, + "routerloss_mlp": 0.0, + "step": 332, + "time_per_iteration": 2.634702682495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224958, + "balance_loss_mlp": 1.19243741, + "diversity_loss_mlp": 0.0, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.06414677867033303, + "language_loss": 0.95931363, + "learning_rate": 0.0009969623315820007, + "loss": 0.97156322, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 333, + "time_per_iteration": 2.6661436557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245141, + "balance_loss_mlp": 1.21149969, + "diversity_loss_mlp": 0.0, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.06624608002660057, + "language_loss": 0.9590115, + "learning_rate": 0.000996927945975565, + "loss": 0.97146285, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.33618164, + "routerloss_mlp": 0.0, + "step": 334, + "time_per_iteration": 2.576922655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252992, + "balance_loss_mlp": 1.21672821, + "diversity_loss_mlp": 0.0, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.07108304231036514, + "language_loss": 0.93002915, + "learning_rate": 0.0009968933674445906, + "loss": 0.94255906, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.36230469, + "routerloss_mlp": 0.0, + "step": 335, + "time_per_iteration": 2.706836462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267675, + "balance_loss_mlp": 1.23026776, + "diversity_loss_mlp": 0.0, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.0701420022906001, + "language_loss": 0.95153642, + "learning_rate": 0.0009968585960025028, + "loss": 0.96421325, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.37402344, + "routerloss_mlp": 0.0, + "step": 336, + "time_per_iteration": 2.9356396198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01838771, + "balance_loss_mlp": 1.81416643, + "diversity_loss_mlp": 0.0, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.09587986506557475, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79491967, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.24511719, + "routerloss_mlp": 0.0, + "step": 337, + "time_per_iteration": 4.784119606018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242978, + "balance_loss_mlp": 1.20874155, + "diversity_loss_mlp": 0.0, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.1007121907193806, + "language_loss": 0.9314844, + "learning_rate": 0.0009967884744390583, + "loss": 0.94391423, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.3425293, + "routerloss_mlp": 0.0, + "step": 338, + "time_per_iteration": 3.5315823554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209945, + "balance_loss_mlp": 1.1758039, + "diversity_loss_mlp": 0.0, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.10820011352875603, + "language_loss": 0.93812096, + "learning_rate": 0.0009967531243449256, + "loss": 0.95022047, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34130859, + "routerloss_mlp": 0.0, + "step": 339, + "time_per_iteration": 2.6663827896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172072, + "balance_loss_mlp": 1.13959908, + "diversity_loss_mlp": 0.0, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07246387309668721, + "language_loss": 1.014539, + "learning_rate": 0.000996717581394126, + "loss": 1.02625966, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.32470703, + "routerloss_mlp": 0.0, + "step": 340, + "time_per_iteration": 2.5849766731262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142038, + "balance_loss_mlp": 1.11142516, + "diversity_loss_mlp": 0.0, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07622939946709405, + "language_loss": 1.01788783, + "learning_rate": 0.000996681845600459, + "loss": 1.0293082, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.30615234, + "routerloss_mlp": 0.0, + "step": 341, + "time_per_iteration": 2.6651370525360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138836, + "balance_loss_mlp": 1.10901034, + "diversity_loss_mlp": 0.0, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.06359259902727714, + "language_loss": 0.94080132, + "learning_rate": 0.0009966459169777982, + "loss": 0.95218974, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.29785156, + "routerloss_mlp": 0.0, + "step": 342, + "time_per_iteration": 2.524775981903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136152, + "balance_loss_mlp": 1.10670757, + "diversity_loss_mlp": 0.0, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07912610309003802, + "language_loss": 1.03090763, + "learning_rate": 0.0009966097955400924, + "loss": 1.04226899, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.29418945, + "routerloss_mlp": 0.0, + "step": 343, + "time_per_iteration": 2.662269115447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074802, + "balance_loss_mlp": 1.74366593, + "diversity_loss_mlp": 0.35364389, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.10968898462568231, + "language_loss": 0.99445379, + "learning_rate": 0.0009965734813013652, + "loss": 1.00520182, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02614743, + "step": 344, + "time_per_iteration": 2.82026743888855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138748, + "balance_loss_mlp": 1.10989952, + "diversity_loss_mlp": 0.0, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.13046244738635646, + "language_loss": 0.99630761, + "learning_rate": 0.0009965369742757151, + "loss": 1.00769508, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.28833008, + "routerloss_mlp": 0.0, + "step": 345, + "time_per_iteration": 2.565809965133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112942, + "balance_loss_mlp": 1.10131097, + "diversity_loss_mlp": 0.0, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.1120170016707216, + "language_loss": 0.96858162, + "learning_rate": 0.0009965002744773152, + "loss": 0.9798758, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 346, + "time_per_iteration": 3.52542781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144914, + "balance_loss_mlp": 1.1170671, + "diversity_loss_mlp": 0.0, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.08447825810050776, + "language_loss": 0.93369007, + "learning_rate": 0.0009964633819204139, + "loss": 0.94513917, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.27832031, + "routerloss_mlp": 0.0, + "step": 347, + "time_per_iteration": 2.6504640579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02729187, + "balance_loss_mlp": 2.68856025, + "diversity_loss_mlp": 0.0, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.36365581545094156, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.84530306, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 348, + "time_per_iteration": 4.9217259883880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01886969, + "balance_loss_mlp": 1.8606472, + "diversity_loss_mlp": 0.0, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.11180228987157655, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.77040851, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.26367188, + "routerloss_mlp": 0.0, + "step": 349, + "time_per_iteration": 4.915479898452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148873, + "balance_loss_mlp": 1.11942816, + "diversity_loss_mlp": 0.0, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.08620115988858058, + "language_loss": 0.93105251, + "learning_rate": 0.000996351547842304, + "loss": 0.94254124, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29443359, + "routerloss_mlp": 0.0, + "step": 350, + "time_per_iteration": 3.2273383140563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183797, + "balance_loss_mlp": 1.152946, + "diversity_loss_mlp": 0.0, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.10656846418921655, + "language_loss": 0.91589314, + "learning_rate": 0.0009963138843953744, + "loss": 0.92773116, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.30810547, + "routerloss_mlp": 0.0, + "step": 351, + "time_per_iteration": 2.6443302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122224, + "balance_loss_mlp": 1.19079256, + "diversity_loss_mlp": 0.0, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.12218392571909323, + "language_loss": 0.95582229, + "learning_rate": 0.000996276028262306, + "loss": 0.9680447, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.31420898, + "routerloss_mlp": 0.0, + "step": 352, + "time_per_iteration": 2.819287061691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121763, + "balance_loss_mlp": 1.18711233, + "diversity_loss_mlp": 0.0, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.14903684788896404, + "language_loss": 1.01496267, + "learning_rate": 0.0009962379794577964, + "loss": 1.02713895, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.30493164, + "routerloss_mlp": 0.0, + "step": 353, + "time_per_iteration": 2.591759204864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123139, + "balance_loss_mlp": 1.2003479, + "diversity_loss_mlp": 0.0, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.0632056956592815, + "language_loss": 0.9195236, + "learning_rate": 0.000996199737996617, + "loss": 0.9318375, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "routerloss_mlp": 0.0, + "step": 354, + "time_per_iteration": 2.889040231704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209696, + "balance_loss_mlp": 1.17963195, + "diversity_loss_mlp": 0.0, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07119928644727336, + "language_loss": 1.00405252, + "learning_rate": 0.0009961613038936149, + "loss": 1.0161494, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.30029297, + "routerloss_mlp": 0.0, + "step": 355, + "time_per_iteration": 2.5856525897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187257, + "balance_loss_mlp": 1.15755057, + "diversity_loss_mlp": 0.0, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.07116362106359332, + "language_loss": 0.93361115, + "learning_rate": 0.000996122677163711, + "loss": 0.9454838, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.296875, + "routerloss_mlp": 0.0, + "step": 356, + "time_per_iteration": 2.8134818077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213028, + "balance_loss_mlp": 1.18367887, + "diversity_loss_mlp": 0.0, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.08014414191517881, + "language_loss": 0.98940754, + "learning_rate": 0.000996083857821902, + "loss": 1.0015378, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.29345703, + "routerloss_mlp": 0.0, + "step": 357, + "time_per_iteration": 3.0531890392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237281, + "balance_loss_mlp": 1.20714498, + "diversity_loss_mlp": 0.0, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.06260381392843543, + "language_loss": 0.96791607, + "learning_rate": 0.0009960448458832588, + "loss": 0.98028892, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30126953, + "routerloss_mlp": 0.0, + "step": 358, + "time_per_iteration": 2.696443557739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236116, + "balance_loss_mlp": 1.20750594, + "diversity_loss_mlp": 0.0, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.07177130169486132, + "language_loss": 0.96227086, + "learning_rate": 0.000996005641362927, + "loss": 0.97463197, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28637695, + "routerloss_mlp": 0.0, + "step": 359, + "time_per_iteration": 2.58060884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229528, + "balance_loss_mlp": 1.19984436, + "diversity_loss_mlp": 0.0, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.09877521418753983, + "language_loss": 0.99257219, + "learning_rate": 0.0009959662442761274, + "loss": 1.00486755, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.29663086, + "routerloss_mlp": 0.0, + "step": 360, + "time_per_iteration": 2.8970725536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241998, + "balance_loss_mlp": 1.21033561, + "diversity_loss_mlp": 0.0, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.07509157549903762, + "language_loss": 0.93086261, + "learning_rate": 0.000995926654638155, + "loss": 0.9432826, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.31640625, + "routerloss_mlp": 0.0, + "step": 361, + "time_per_iteration": 2.787796974182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225169, + "balance_loss_mlp": 1.19405532, + "diversity_loss_mlp": 0.0, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.08313329413520473, + "language_loss": 0.94580126, + "learning_rate": 0.00099588687246438, + "loss": 0.95805293, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.31103516, + "routerloss_mlp": 0.0, + "step": 362, + "time_per_iteration": 2.826186418533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188587, + "balance_loss_mlp": 1.15785527, + "diversity_loss_mlp": 0.0, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.12654684897021498, + "language_loss": 1.02203465, + "learning_rate": 0.0009958468977702471, + "loss": 1.03392053, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 363, + "time_per_iteration": 2.5915637016296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02117372, + "balance_loss_mlp": 1.97470212, + "diversity_loss_mlp": 0.0, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.12517092959889778, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81852078, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 1.4296875, + "routerloss_mlp": 0.0, + "step": 364, + "time_per_iteration": 4.79950737953186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195198, + "balance_loss_mlp": 1.16406059, + "diversity_loss_mlp": 0.0, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.08484436116426784, + "language_loss": 0.90580225, + "learning_rate": 0.0009957663708830612, + "loss": 0.91775423, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 365, + "time_per_iteration": 3.2616662979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119947, + "balance_loss_mlp": 1.16575801, + "diversity_loss_mlp": 0.0, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.10575932689534903, + "language_loss": 0.93159938, + "learning_rate": 0.0009957258187212714, + "loss": 0.9435941, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.33740234, + "routerloss_mlp": 0.0, + "step": 366, + "time_per_iteration": 3.0113134384155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02012454, + "balance_loss_mlp": 1.90030205, + "diversity_loss_mlp": 0.0, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.0781885975604906, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.81207317, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 1.125, + "routerloss_mlp": 0.0, + "step": 367, + "time_per_iteration": 4.857182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238272, + "balance_loss_mlp": 1.20377314, + "diversity_loss_mlp": 0.0, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.10459556468103207, + "language_loss": 0.9040041, + "learning_rate": 0.0009956441370400167, + "loss": 0.91638684, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.34472656, + "routerloss_mlp": 0.0, + "step": 368, + "time_per_iteration": 2.6384623050689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212552, + "balance_loss_mlp": 1.17986465, + "diversity_loss_mlp": 0.0, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11871319311308551, + "language_loss": 0.96155751, + "learning_rate": 0.0009956030075522636, + "loss": 0.973683, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3269043, + "routerloss_mlp": 0.0, + "step": 369, + "time_per_iteration": 2.7690951824188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.85686088, + "diversity_loss_mlp": 0.26596725, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.0445321938876095, + "language_loss": 0.99161661, + "learning_rate": 0.0009955616856543587, + "loss": 1.00259984, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.03691306, + "step": 370, + "time_per_iteration": 2.6551451683044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136923, + "balance_loss_mlp": 1.10690594, + "diversity_loss_mlp": 0.0, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.06345816714032589, + "language_loss": 0.89315635, + "learning_rate": 0.0009955201713623448, + "loss": 0.90452558, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.29980469, + "routerloss_mlp": 0.0, + "step": 371, + "time_per_iteration": 2.7738049030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01981215, + "balance_loss_mlp": 1.93124223, + "diversity_loss_mlp": 0.0, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.16358882606758401, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78653932, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "routerloss_mlp": 0.0, + "step": 372, + "time_per_iteration": 4.94252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117773, + "balance_loss_mlp": 1.08999681, + "diversity_loss_mlp": 0.0, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.14652608757044766, + "language_loss": 1.03006279, + "learning_rate": 0.0009954365656605333, + "loss": 1.04124057, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.27783203, + "routerloss_mlp": 0.0, + "step": 373, + "time_per_iteration": 2.551156759262085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138367, + "balance_loss_mlp": 1.10901785, + "diversity_loss_mlp": 0.0, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.09116429227244367, + "language_loss": 0.95790577, + "learning_rate": 0.0009953944742831947, + "loss": 0.96928942, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.29296875, + "routerloss_mlp": 0.0, + "step": 374, + "time_per_iteration": 2.995286226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159694, + "balance_loss_mlp": 1.13084567, + "diversity_loss_mlp": 0.0, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.10582188185488459, + "language_loss": 0.99257255, + "learning_rate": 0.0009953521905766642, + "loss": 1.00416946, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.28808594, + "routerloss_mlp": 0.0, + "step": 375, + "time_per_iteration": 2.946237325668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186879, + "balance_loss_mlp": 1.15664721, + "diversity_loss_mlp": 0.0, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.09648654328935216, + "language_loss": 0.97696835, + "learning_rate": 0.0009953097145573577, + "loss": 0.98883718, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.30200195, + "routerloss_mlp": 0.0, + "step": 376, + "time_per_iteration": 2.64080548286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119333, + "balance_loss_mlp": 1.16164398, + "diversity_loss_mlp": 0.0, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.11805021949506506, + "language_loss": 0.95023847, + "learning_rate": 0.000995267046241766, + "loss": 0.96217185, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 377, + "time_per_iteration": 3.2120020389556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188603, + "balance_loss_mlp": 1.15617776, + "diversity_loss_mlp": 0.0, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.10215127385841216, + "language_loss": 0.94931126, + "learning_rate": 0.0009952241856464547, + "loss": 0.96119732, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.32421875, + "routerloss_mlp": 0.0, + "step": 378, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183617, + "balance_loss_mlp": 1.14971423, + "diversity_loss_mlp": 0.0, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.08294465031859817, + "language_loss": 1.01604176, + "learning_rate": 0.0009951811327880632, + "loss": 1.02787805, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.33911133, + "routerloss_mlp": 0.0, + "step": 379, + "time_per_iteration": 2.7318813800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173744, + "balance_loss_mlp": 1.13891101, + "diversity_loss_mlp": 0.0, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.06744176383892367, + "language_loss": 0.94898254, + "learning_rate": 0.0009951378876833063, + "loss": 0.96071994, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.34838867, + "routerloss_mlp": 0.0, + "step": 380, + "time_per_iteration": 2.565268039703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198329, + "balance_loss_mlp": 1.16392517, + "diversity_loss_mlp": 0.0, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.08808941505023588, + "language_loss": 1.01867247, + "learning_rate": 0.0009950944503489736, + "loss": 1.03065586, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34399414, + "routerloss_mlp": 0.0, + "step": 381, + "time_per_iteration": 2.7605583667755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220014, + "balance_loss_mlp": 1.18479919, + "diversity_loss_mlp": 0.0, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.09503573620830386, + "language_loss": 0.95487726, + "learning_rate": 0.0009950508208019285, + "loss": 0.96707737, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.35253906, + "routerloss_mlp": 0.0, + "step": 382, + "time_per_iteration": 3.023996591567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224507, + "balance_loss_mlp": 1.19086623, + "diversity_loss_mlp": 0.0, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.09021711867793632, + "language_loss": 1.0023253, + "learning_rate": 0.0009950069990591096, + "loss": 1.01457047, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.33666992, + "routerloss_mlp": 0.0, + "step": 383, + "time_per_iteration": 2.62634015083313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02435347, + "balance_loss_mlp": 2.36668229, + "diversity_loss_mlp": 0.0, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.252441104666548, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78836709, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.6875, + "routerloss_mlp": 0.0, + "step": 384, + "time_per_iteration": 4.887000322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205074, + "balance_loss_mlp": 1.17217231, + "diversity_loss_mlp": 0.0, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.13776686153508858, + "language_loss": 0.92669415, + "learning_rate": 0.0009949187790542777, + "loss": 0.93874478, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.32910156, + "routerloss_mlp": 0.0, + "step": 385, + "time_per_iteration": 2.7325563430786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158547, + "balance_loss_mlp": 1.12683773, + "diversity_loss_mlp": 0.0, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.09404920935129117, + "language_loss": 0.89306223, + "learning_rate": 0.0009948743808265148, + "loss": 0.90464771, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.31689453, + "routerloss_mlp": 0.0, + "step": 386, + "time_per_iteration": 2.723581314086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152345, + "balance_loss_mlp": 1.12321043, + "diversity_loss_mlp": 0.0, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.11553674714385681, + "language_loss": 0.98625511, + "learning_rate": 0.0009948297904714782, + "loss": 0.99777853, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29125977, + "routerloss_mlp": 0.0, + "step": 387, + "time_per_iteration": 2.6925902366638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152671, + "balance_loss_mlp": 1.12460923, + "diversity_loss_mlp": 0.0, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.10281917509950625, + "language_loss": 0.91430104, + "learning_rate": 0.0009947850080064796, + "loss": 0.92582774, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.28076172, + "routerloss_mlp": 0.0, + "step": 388, + "time_per_iteration": 2.7813222408294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_mlp": 1.80238378, + "diversity_loss_mlp": 0.24433145, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.03140321958098528, + "language_loss": 0.96549261, + "learning_rate": 0.0009947400334489047, + "loss": 0.97600979, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0283502, + "step": 389, + "time_per_iteration": 3.055640459060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_mlp": 1.11867988, + "diversity_loss_mlp": 0.0, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.10120121915973303, + "language_loss": 0.87344396, + "learning_rate": 0.0009946948668162145, + "loss": 0.88490444, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.27392578, + "routerloss_mlp": 0.0, + "step": 390, + "time_per_iteration": 2.7240688800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159261, + "balance_loss_mlp": 1.13079381, + "diversity_loss_mlp": 0.0, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.0733706931740777, + "language_loss": 0.92598295, + "learning_rate": 0.0009946495081259441, + "loss": 0.93757558, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 391, + "time_per_iteration": 2.8451168537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145886, + "balance_loss_mlp": 1.11753774, + "diversity_loss_mlp": 0.0, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.0986246500370879, + "language_loss": 0.95604634, + "learning_rate": 0.0009946039573957035, + "loss": 0.96750522, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.28344727, + "routerloss_mlp": 0.0, + "step": 392, + "time_per_iteration": 2.943962574005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142945, + "balance_loss_mlp": 1.11550307, + "diversity_loss_mlp": 0.0, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.0698233472363084, + "language_loss": 0.92221498, + "learning_rate": 0.000994558214643177, + "loss": 0.93364441, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.27441406, + "routerloss_mlp": 0.0, + "step": 393, + "time_per_iteration": 2.7336390018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137637, + "balance_loss_mlp": 1.10933709, + "diversity_loss_mlp": 0.0, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.0667709001177297, + "language_loss": 0.93581867, + "learning_rate": 0.000994512279886123, + "loss": 0.94719505, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.28295898, + "routerloss_mlp": 0.0, + "step": 394, + "time_per_iteration": 3.0792524814605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148773, + "balance_loss_mlp": 1.12104487, + "diversity_loss_mlp": 0.0, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.057306164352953166, + "language_loss": 0.94243777, + "learning_rate": 0.0009944661531423758, + "loss": 0.95392549, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.27758789, + "routerloss_mlp": 0.0, + "step": 395, + "time_per_iteration": 2.7003707885742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169709, + "balance_loss_mlp": 1.14162326, + "diversity_loss_mlp": 0.0, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.09187664036534561, + "language_loss": 0.92709243, + "learning_rate": 0.000994419834429843, + "loss": 0.93878949, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.28125, + "routerloss_mlp": 0.0, + "step": 396, + "time_per_iteration": 2.654961109161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184579, + "balance_loss_mlp": 1.15613592, + "diversity_loss_mlp": 0.0, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.10401840603132484, + "language_loss": 0.96742636, + "learning_rate": 0.0009943733237665069, + "loss": 0.97927213, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.28466797, + "routerloss_mlp": 0.0, + "step": 397, + "time_per_iteration": 2.8282015323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204203, + "balance_loss_mlp": 1.17542565, + "diversity_loss_mlp": 0.0, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.06433229599495933, + "language_loss": 0.96130294, + "learning_rate": 0.0009943266211704248, + "loss": 0.97334492, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.28759766, + "routerloss_mlp": 0.0, + "step": 398, + "time_per_iteration": 2.970426321029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183998, + "balance_loss_mlp": 1.15534043, + "diversity_loss_mlp": 0.0, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.08157022591406732, + "language_loss": 0.98195136, + "learning_rate": 0.000994279726659728, + "loss": 0.99379134, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.28662109, + "routerloss_mlp": 0.0, + "step": 399, + "time_per_iteration": 2.5123794078826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177562, + "balance_loss_mlp": 1.14926195, + "diversity_loss_mlp": 0.0, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.07895179134063258, + "language_loss": 0.95376462, + "learning_rate": 0.0009942326402526231, + "loss": 0.96554029, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.28320312, + "routerloss_mlp": 0.0, + "step": 400, + "time_per_iteration": 2.52349591255188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146856, + "balance_loss_mlp": 1.11905658, + "diversity_loss_mlp": 0.0, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0705701607591385, + "language_loss": 0.94442534, + "learning_rate": 0.0009941853619673902, + "loss": 0.95589387, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.27807617, + "routerloss_mlp": 0.0, + "step": 401, + "time_per_iteration": 2.643442153930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134479, + "balance_loss_mlp": 1.10811007, + "diversity_loss_mlp": 0.0, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.11619926948996102, + "language_loss": 0.97199881, + "learning_rate": 0.0009941378918223844, + "loss": 0.9833436, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.26416016, + "routerloss_mlp": 0.0, + "step": 402, + "time_per_iteration": 3.05241322517395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124539, + "balance_loss_mlp": 1.09765708, + "diversity_loss_mlp": 0.0, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.0628584922031364, + "language_loss": 0.90586787, + "learning_rate": 0.0009940902298360354, + "loss": 0.91711324, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.26916504, + "routerloss_mlp": 0.0, + "step": 403, + "time_per_iteration": 2.739593744277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123125, + "balance_loss_mlp": 1.09564674, + "diversity_loss_mlp": 0.0, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.07463467829204698, + "language_loss": 0.99357891, + "learning_rate": 0.0009940423760268473, + "loss": 1.00481009, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.27478027, + "routerloss_mlp": 0.0, + "step": 404, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123907, + "balance_loss_mlp": 1.09644127, + "diversity_loss_mlp": 0.0, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.08544352707712408, + "language_loss": 0.93046296, + "learning_rate": 0.0009939943304133982, + "loss": 0.94170201, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.27514648, + "routerloss_mlp": 0.0, + "step": 405, + "time_per_iteration": 2.631242275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929276, + "balance_loss_mlp": 1.55583501, + "diversity_loss_mlp": 0.25816602, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.039808149400508724, + "language_loss": 1.0085814, + "learning_rate": 0.0009939460930143416, + "loss": 1.017874, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02227605, + "step": 406, + "time_per_iteration": 2.655000925064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908113, + "balance_loss_mlp": 1.5136435, + "diversity_loss_mlp": 0.25845903, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.031543409668047605, + "language_loss": 0.94866949, + "learning_rate": 0.0009938976638484043, + "loss": 0.95775062, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02206134, + "step": 407, + "time_per_iteration": 2.932522773742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125815, + "balance_loss_mlp": 1.09954083, + "diversity_loss_mlp": 0.0, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.0874520562524596, + "language_loss": 0.93291676, + "learning_rate": 0.0009938490429343887, + "loss": 0.94417489, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 408, + "time_per_iteration": 2.5488343238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128514, + "balance_loss_mlp": 1.10140562, + "diversity_loss_mlp": 0.0, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.1051667442879041, + "language_loss": 0.94155729, + "learning_rate": 0.0009938002302911709, + "loss": 0.95284247, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 409, + "time_per_iteration": 2.7672979831695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.10946035, + "diversity_loss_mlp": 0.0, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.09613329153911296, + "language_loss": 0.9601537, + "learning_rate": 0.0009937512259377015, + "loss": 0.97151482, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 410, + "time_per_iteration": 2.674072504043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159019, + "balance_loss_mlp": 1.13217306, + "diversity_loss_mlp": 0.0, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.05951235305386178, + "language_loss": 0.95475662, + "learning_rate": 0.000993702029893006, + "loss": 0.96634674, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.26879883, + "routerloss_mlp": 0.0, + "step": 411, + "time_per_iteration": 2.7913753986358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185856, + "balance_loss_mlp": 1.15731764, + "diversity_loss_mlp": 0.0, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.10961223184545879, + "language_loss": 0.95336723, + "learning_rate": 0.0009936526421761838, + "loss": 0.96522582, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.28540039, + "routerloss_mlp": 0.0, + "step": 412, + "time_per_iteration": 3.036557197570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181446, + "balance_loss_mlp": 1.15414703, + "diversity_loss_mlp": 0.0, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.09075853005030154, + "language_loss": 0.97731507, + "learning_rate": 0.000993603062806409, + "loss": 0.98912954, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.27319336, + "routerloss_mlp": 0.0, + "step": 413, + "time_per_iteration": 2.690500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166438, + "balance_loss_mlp": 1.1394248, + "diversity_loss_mlp": 0.0, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.0841151797190701, + "language_loss": 1.00301099, + "learning_rate": 0.0009935532918029298, + "loss": 1.01467538, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.27050781, + "routerloss_mlp": 0.0, + "step": 414, + "time_per_iteration": 2.6386477947235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171646, + "balance_loss_mlp": 1.14432323, + "diversity_loss_mlp": 0.0, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.07267589634089947, + "language_loss": 0.94145483, + "learning_rate": 0.0009935033291850694, + "loss": 0.95317131, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.27307129, + "routerloss_mlp": 0.0, + "step": 415, + "time_per_iteration": 2.6771326065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138565, + "balance_loss_mlp": 1.11312544, + "diversity_loss_mlp": 0.0, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09244391725109519, + "language_loss": 0.96404541, + "learning_rate": 0.0009934531749722247, + "loss": 0.97543103, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 416, + "time_per_iteration": 2.586975574493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132517, + "balance_loss_mlp": 1.10733998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.0915153559751851, + "language_loss": 0.94398224, + "learning_rate": 0.0009934028291838672, + "loss": 0.95530736, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.25183105, + "routerloss_mlp": 0.0, + "step": 417, + "time_per_iteration": 2.7062928676605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150706, + "balance_loss_mlp": 1.1251713, + "diversity_loss_mlp": 0.0, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.10053131301435142, + "language_loss": 0.89968443, + "learning_rate": 0.0009933522918395433, + "loss": 0.91119152, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.25549316, + "routerloss_mlp": 0.0, + "step": 418, + "time_per_iteration": 2.65326189994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00760745, + "balance_loss_mlp": 1.16580379, + "diversity_loss_mlp": 0.256477, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.006992447528439397, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79011846, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.049605, + "step": 419, + "time_per_iteration": 4.8772523403167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176473, + "balance_loss_mlp": 1.15143883, + "diversity_loss_mlp": 0.0, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.08608768077535772, + "language_loss": 1.07860529, + "learning_rate": 0.000993250642561551, + "loss": 1.09036994, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.25061035, + "routerloss_mlp": 0.0, + "step": 420, + "time_per_iteration": 2.588672399520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176613, + "balance_loss_mlp": 1.15165043, + "diversity_loss_mlp": 0.0, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.09804047271530963, + "language_loss": 0.93524832, + "learning_rate": 0.0009931995306673466, + "loss": 0.94701445, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 421, + "time_per_iteration": 2.734513521194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200943, + "balance_loss_mlp": 1.17474103, + "diversity_loss_mlp": 0.0, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.0768650968130289, + "language_loss": 0.98959565, + "learning_rate": 0.000993148227296103, + "loss": 1.00160503, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 422, + "time_per_iteration": 2.6389012336730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185361, + "balance_loss_mlp": 1.1604228, + "diversity_loss_mlp": 0.0, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.08220754838372611, + "language_loss": 0.87845761, + "learning_rate": 0.000993096732467738, + "loss": 0.89031118, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.24938965, + "routerloss_mlp": 0.0, + "step": 423, + "time_per_iteration": 2.976412057876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00884908, + "balance_loss_mlp": 1.45653749, + "diversity_loss_mlp": 0.26738948, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.04326164577840749, + "language_loss": 0.94753903, + "learning_rate": 0.0009930450462022435, + "loss": 0.95638812, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02294483, + "step": 424, + "time_per_iteration": 2.9038002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02462639, + "balance_loss_mlp": 2.35582733, + "diversity_loss_mlp": 0.0, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.15208391867633483, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.81652445, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 1.0703125, + "routerloss_mlp": 0.0, + "step": 425, + "time_per_iteration": 4.893689155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182525, + "balance_loss_mlp": 1.15690684, + "diversity_loss_mlp": 0.0, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.10181541083425144, + "language_loss": 0.92197704, + "learning_rate": 0.0009929410994402065, + "loss": 0.93380231, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.25646973, + "routerloss_mlp": 0.0, + "step": 426, + "time_per_iteration": 3.793488025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863772, + "balance_loss_mlp": 1.42266524, + "diversity_loss_mlp": 0.26325443, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.038163151149059646, + "language_loss": 0.97185421, + "learning_rate": 0.0009928888389840196, + "loss": 0.98049194, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02081174, + "step": 427, + "time_per_iteration": 2.7310097217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196199, + "balance_loss_mlp": 1.1708436, + "diversity_loss_mlp": 0.0, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.1014811860289813, + "language_loss": 0.98936689, + "learning_rate": 0.0009928363871714147, + "loss": 1.00132895, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.25378418, + "routerloss_mlp": 0.0, + "step": 428, + "time_per_iteration": 2.650698184967041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198239, + "balance_loss_mlp": 1.17194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.0884548399202502, + "language_loss": 0.93840969, + "learning_rate": 0.0009927837440227556, + "loss": 0.95039201, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.26306152, + "routerloss_mlp": 0.0, + "step": 429, + "time_per_iteration": 2.8162689208984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199498, + "balance_loss_mlp": 1.17399931, + "diversity_loss_mlp": 0.0, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.0660726649824177, + "language_loss": 0.88846099, + "learning_rate": 0.0009927309095584798, + "loss": 0.90045595, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.25524902, + "routerloss_mlp": 0.0, + "step": 430, + "time_per_iteration": 2.975594997406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190829, + "balance_loss_mlp": 1.1661284, + "diversity_loss_mlp": 0.0, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.08430379744466543, + "language_loss": 0.98639262, + "learning_rate": 0.0009926778837991, + "loss": 0.99830091, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.24682617, + "routerloss_mlp": 0.0, + "step": 431, + "time_per_iteration": 2.595855236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187757, + "balance_loss_mlp": 1.16231799, + "diversity_loss_mlp": 0.0, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.08045199303169787, + "language_loss": 0.97297168, + "learning_rate": 0.000992624666765202, + "loss": 0.98484921, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.2545166, + "routerloss_mlp": 0.0, + "step": 432, + "time_per_iteration": 2.828488826751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195331, + "balance_loss_mlp": 1.17080951, + "diversity_loss_mlp": 0.0, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.08518069864439091, + "language_loss": 0.9513936, + "learning_rate": 0.000992571258477447, + "loss": 0.96334684, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 433, + "time_per_iteration": 2.7914628982543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181479, + "balance_loss_mlp": 1.15727913, + "diversity_loss_mlp": 0.0, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.08514456826718247, + "language_loss": 0.89393032, + "learning_rate": 0.0009925176589565695, + "loss": 0.90574509, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.24182129, + "routerloss_mlp": 0.0, + "step": 434, + "time_per_iteration": 2.847381830215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154073, + "balance_loss_mlp": 1.13002813, + "diversity_loss_mlp": 0.0, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.09497783603336436, + "language_loss": 0.99263078, + "learning_rate": 0.0009924638682233791, + "loss": 1.00417161, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.24047852, + "routerloss_mlp": 0.0, + "step": 435, + "time_per_iteration": 2.5871427059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02505725, + "balance_loss_mlp": 2.43934894, + "diversity_loss_mlp": 0.0, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.06827578128022488, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.82070321, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.6640625, + "routerloss_mlp": 0.0, + "step": 436, + "time_per_iteration": 4.539026737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138708, + "balance_loss_mlp": 1.11440182, + "diversity_loss_mlp": 0.0, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10357837156718612, + "language_loss": 0.8856501, + "learning_rate": 0.0009923557132036668, + "loss": 0.89703721, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 437, + "time_per_iteration": 3.0414698123931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124752, + "balance_loss_mlp": 1.09998, + "diversity_loss_mlp": 0.0, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.06660243724344939, + "language_loss": 0.94103611, + "learning_rate": 0.0009923013489591345, + "loss": 0.95228368, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.24768066, + "routerloss_mlp": 0.0, + "step": 438, + "time_per_iteration": 2.7426626682281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857144, + "balance_loss_mlp": 1.4199276, + "diversity_loss_mlp": 0.26049304, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.04620678173721227, + "language_loss": 0.92873847, + "learning_rate": 0.0009922467935862681, + "loss": 0.93730992, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01693399, + "step": 439, + "time_per_iteration": 3.107149124145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113851, + "balance_loss_mlp": 1.11386943, + "diversity_loss_mlp": 0.0, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.07763968648184205, + "language_loss": 0.95120305, + "learning_rate": 0.0009921920471062478, + "loss": 0.96258819, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 440, + "time_per_iteration": 2.572195529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139984, + "balance_loss_mlp": 1.11489022, + "diversity_loss_mlp": 0.0, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.0880262953369173, + "language_loss": 0.92829931, + "learning_rate": 0.0009921371095403281, + "loss": 0.93969917, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.25109863, + "routerloss_mlp": 0.0, + "step": 441, + "time_per_iteration": 2.6386919021606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156684, + "balance_loss_mlp": 1.13206697, + "diversity_loss_mlp": 0.0, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.09427081021892933, + "language_loss": 0.95792937, + "learning_rate": 0.0009920819809098379, + "loss": 0.96949625, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.24633789, + "routerloss_mlp": 0.0, + "step": 442, + "time_per_iteration": 2.588674783706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169571, + "balance_loss_mlp": 1.1441319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.0873536117240321, + "language_loss": 0.91373646, + "learning_rate": 0.0009920266612361798, + "loss": 0.92543221, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.25463867, + "routerloss_mlp": 0.0, + "step": 443, + "time_per_iteration": 2.755526065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167225, + "balance_loss_mlp": 1.14349055, + "diversity_loss_mlp": 0.0, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.07116177044877865, + "language_loss": 0.90907955, + "learning_rate": 0.0009919711505408308, + "loss": 0.92075175, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.23718262, + "routerloss_mlp": 0.0, + "step": 444, + "time_per_iteration": 2.7939865589141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116645, + "balance_loss_mlp": 1.14170241, + "diversity_loss_mlp": 0.0, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.09221719775958219, + "language_loss": 0.89192301, + "learning_rate": 0.000991915448845342, + "loss": 0.90358752, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.24731445, + "routerloss_mlp": 0.0, + "step": 445, + "time_per_iteration": 2.5457842350006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154656, + "balance_loss_mlp": 1.13168466, + "diversity_loss_mlp": 0.0, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.08780021998431992, + "language_loss": 0.98329008, + "learning_rate": 0.000991859556171339, + "loss": 0.99483669, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.22973633, + "routerloss_mlp": 0.0, + "step": 446, + "time_per_iteration": 2.6356756687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0083848, + "balance_loss_mlp": 1.38336182, + "diversity_loss_mlp": 0.25472927, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.049564893991705376, + "language_loss": 1.00050902, + "learning_rate": 0.000991803472540521, + "loss": 1.00889397, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01943407, + "step": 447, + "time_per_iteration": 2.631704807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130193, + "balance_loss_mlp": 1.1087712, + "diversity_loss_mlp": 0.0, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.11682082282160788, + "language_loss": 0.94917679, + "learning_rate": 0.0009917471979746615, + "loss": 0.96047872, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 448, + "time_per_iteration": 2.9820516109466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122722, + "balance_loss_mlp": 1.10119319, + "diversity_loss_mlp": 0.0, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.07207820272739716, + "language_loss": 0.94521272, + "learning_rate": 0.0009916907324956086, + "loss": 0.95643997, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 449, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127326, + "balance_loss_mlp": 1.10453379, + "diversity_loss_mlp": 0.0, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.081693490118891, + "language_loss": 0.90889072, + "learning_rate": 0.0009916340761252837, + "loss": 0.92016399, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 450, + "time_per_iteration": 2.598238945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124635, + "balance_loss_mlp": 1.10287929, + "diversity_loss_mlp": 0.0, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08322873762038852, + "language_loss": 0.88526833, + "learning_rate": 0.0009915772288856832, + "loss": 0.89651471, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.21765137, + "routerloss_mlp": 0.0, + "step": 451, + "time_per_iteration": 3.0680441856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121178, + "balance_loss_mlp": 1.09876692, + "diversity_loss_mlp": 0.0, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07764148626601892, + "language_loss": 0.8994481, + "learning_rate": 0.000991520190798877, + "loss": 0.91065991, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22412109, + "routerloss_mlp": 0.0, + "step": 452, + "time_per_iteration": 2.7982983589172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136254, + "balance_loss_mlp": 1.11281788, + "diversity_loss_mlp": 0.0, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.11496723003988224, + "language_loss": 0.98584056, + "learning_rate": 0.0009914629618870089, + "loss": 0.99720311, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 453, + "time_per_iteration": 2.8737423419952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0218934, + "balance_loss_mlp": 2.1624465, + "diversity_loss_mlp": 0.0, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.09249743450545506, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.8086521, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.26953125, + "routerloss_mlp": 0.0, + "step": 454, + "time_per_iteration": 4.756322860717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02065274, + "balance_loss_mlp": 2.03780842, + "diversity_loss_mlp": 0.0, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.0744981683452351, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.83493233, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.27539062, + "routerloss_mlp": 0.0, + "step": 455, + "time_per_iteration": 2.173584461212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848454, + "balance_loss_mlp": 1.40727437, + "diversity_loss_mlp": 0.24745712, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.04702924064086775, + "language_loss": 0.92085564, + "learning_rate": 0.0009912901304235883, + "loss": 0.92934018, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0210887, + "step": 456, + "time_per_iteration": 2.868276596069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273346, + "balance_loss_mlp": 1.24886012, + "diversity_loss_mlp": 0.0, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.1518400720273604, + "language_loss": 0.87943619, + "learning_rate": 0.000991232138434397, + "loss": 0.89216965, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.24499512, + "routerloss_mlp": 0.0, + "step": 457, + "time_per_iteration": 2.8729381561279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262571, + "balance_loss_mlp": 1.23763299, + "diversity_loss_mlp": 0.0, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.14470377187588201, + "language_loss": 0.94336045, + "learning_rate": 0.000991173955731976, + "loss": 0.9559862, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.24951172, + "routerloss_mlp": 0.0, + "step": 458, + "time_per_iteration": 2.7100729942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218734, + "balance_loss_mlp": 1.19520259, + "diversity_loss_mlp": 0.0, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.09239254139658798, + "language_loss": 0.99845707, + "learning_rate": 0.0009911155823389137, + "loss": 1.01064444, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.23547363, + "routerloss_mlp": 0.0, + "step": 459, + "time_per_iteration": 2.9462080001831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178782, + "balance_loss_mlp": 1.1555717, + "diversity_loss_mlp": 0.0, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.0878830171329016, + "language_loss": 0.95269191, + "learning_rate": 0.000991057018277873, + "loss": 0.9644798, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.23205566, + "routerloss_mlp": 0.0, + "step": 460, + "time_per_iteration": 2.7473583221435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151252, + "balance_loss_mlp": 1.12904322, + "diversity_loss_mlp": 0.0, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.1205367347306004, + "language_loss": 0.9509443, + "learning_rate": 0.0009909982635715898, + "loss": 0.96245682, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22216797, + "routerloss_mlp": 0.0, + "step": 461, + "time_per_iteration": 2.6226725578308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145607, + "balance_loss_mlp": 1.12300491, + "diversity_loss_mlp": 0.0, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.0884001914091671, + "language_loss": 0.94182885, + "learning_rate": 0.0009909393182428751, + "loss": 0.95328492, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22619629, + "routerloss_mlp": 0.0, + "step": 462, + "time_per_iteration": 2.632216453552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157329, + "balance_loss_mlp": 1.13402367, + "diversity_loss_mlp": 0.0, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.09814328047414513, + "language_loss": 0.89072084, + "learning_rate": 0.000990880182314614, + "loss": 0.90229416, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 463, + "time_per_iteration": 2.6763410568237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.008652, + "balance_loss_mlp": 1.44467092, + "diversity_loss_mlp": 0.24997658, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.034550824680377484, + "language_loss": 0.89998591, + "learning_rate": 0.0009908208558097643, + "loss": 0.90863788, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01787652, + "step": 464, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224446, + "balance_loss_mlp": 1.20036614, + "diversity_loss_mlp": 0.0, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.11121459240038054, + "language_loss": 0.9153899, + "learning_rate": 0.000990761338751359, + "loss": 0.92763436, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 465, + "time_per_iteration": 2.7976956367492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01887012, + "balance_loss_mlp": 1.84867477, + "diversity_loss_mlp": 0.0, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.10155840838291885, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.75546634, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.3828125, + "routerloss_mlp": 0.0, + "step": 466, + "time_per_iteration": 4.965139150619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319273, + "balance_loss_mlp": 1.29344034, + "diversity_loss_mlp": 0.0, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.10901527230577203, + "language_loss": 0.93872285, + "learning_rate": 0.0009906417330663815, + "loss": 0.95191562, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 467, + "time_per_iteration": 2.628042459487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01352641, + "balance_loss_mlp": 1.3264153, + "diversity_loss_mlp": 0.0, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.10051526680757361, + "language_loss": 0.90321958, + "learning_rate": 0.0009905816444862442, + "loss": 0.91674596, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.26245117, + "routerloss_mlp": 0.0, + "step": 468, + "time_per_iteration": 2.613952398300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396274, + "balance_loss_mlp": 1.36905813, + "diversity_loss_mlp": 0.0, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.10220310656667285, + "language_loss": 0.88433367, + "learning_rate": 0.0009905213654454216, + "loss": 0.89829642, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.27209473, + "routerloss_mlp": 0.0, + "step": 469, + "time_per_iteration": 2.897365093231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01363851, + "balance_loss_mlp": 1.3367548, + "diversity_loss_mlp": 0.0, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.11223211494597432, + "language_loss": 0.94907629, + "learning_rate": 0.0009904608959673158, + "loss": 0.96271479, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.2713623, + "routerloss_mlp": 0.0, + "step": 470, + "time_per_iteration": 2.7828967571258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328731, + "balance_loss_mlp": 1.30289829, + "diversity_loss_mlp": 0.0, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.10534875872888719, + "language_loss": 0.94143116, + "learning_rate": 0.000990400236075403, + "loss": 0.95471847, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25866699, + "routerloss_mlp": 0.0, + "step": 471, + "time_per_iteration": 2.5291385650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126, + "balance_loss_mlp": 1.23546696, + "diversity_loss_mlp": 0.0, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.08150240013734093, + "language_loss": 0.92401147, + "learning_rate": 0.0009903393857932338, + "loss": 0.93661153, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24536133, + "routerloss_mlp": 0.0, + "step": 472, + "time_per_iteration": 2.6317975521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234666, + "balance_loss_mlp": 1.21105075, + "diversity_loss_mlp": 0.0, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.1079858906687858, + "language_loss": 0.89742762, + "learning_rate": 0.0009902783451444317, + "loss": 0.90977424, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.23583984, + "routerloss_mlp": 0.0, + "step": 473, + "time_per_iteration": 2.708159923553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204783, + "balance_loss_mlp": 1.18326581, + "diversity_loss_mlp": 0.0, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.08561107807714156, + "language_loss": 0.94620812, + "learning_rate": 0.0009902171141526956, + "loss": 0.95825595, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.21533203, + "routerloss_mlp": 0.0, + "step": 474, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196875, + "balance_loss_mlp": 1.17460644, + "diversity_loss_mlp": 0.0, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.10745755704500252, + "language_loss": 0.82875264, + "learning_rate": 0.000990155692841797, + "loss": 0.84072143, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.22277832, + "routerloss_mlp": 0.0, + "step": 475, + "time_per_iteration": 2.985820770263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191147, + "balance_loss_mlp": 1.16911697, + "diversity_loss_mlp": 0.0, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.10692573165988825, + "language_loss": 0.93685389, + "learning_rate": 0.0009900940812355818, + "loss": 0.9487654, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.22033691, + "routerloss_mlp": 0.0, + "step": 476, + "time_per_iteration": 2.882946014404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182015, + "balance_loss_mlp": 1.15972316, + "diversity_loss_mlp": 0.0, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.15748592495925862, + "language_loss": 0.89566875, + "learning_rate": 0.00099003227935797, + "loss": 0.90748894, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.22290039, + "routerloss_mlp": 0.0, + "step": 477, + "time_per_iteration": 2.729729413986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176422, + "balance_loss_mlp": 1.15324748, + "diversity_loss_mlp": 0.0, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.11223041806675033, + "language_loss": 0.92644513, + "learning_rate": 0.000989970287232955, + "loss": 0.93820935, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.23156738, + "routerloss_mlp": 0.0, + "step": 478, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168241, + "balance_loss_mlp": 1.14524555, + "diversity_loss_mlp": 0.0, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.08330283562574453, + "language_loss": 0.90444613, + "learning_rate": 0.0009899081048846043, + "loss": 0.91612852, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 479, + "time_per_iteration": 2.548454523086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230508, + "balance_loss_mlp": 1.20630884, + "diversity_loss_mlp": 0.0, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.17103007353978975, + "language_loss": 0.94793594, + "learning_rate": 0.0009898457323370593, + "loss": 0.96024096, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.24206543, + "routerloss_mlp": 0.0, + "step": 480, + "time_per_iteration": 2.582655668258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249007, + "balance_loss_mlp": 1.22349596, + "diversity_loss_mlp": 0.0, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.11976742763400251, + "language_loss": 0.9370476, + "learning_rate": 0.000989783169614535, + "loss": 0.94953763, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25537109, + "routerloss_mlp": 0.0, + "step": 481, + "time_per_iteration": 2.6305787563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01772239, + "balance_loss_mlp": 1.74649, + "diversity_loss_mlp": 0.0, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.0876770513617693, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80524993, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 482, + "time_per_iteration": 4.8690409660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276229, + "balance_loss_mlp": 1.25084925, + "diversity_loss_mlp": 0.0, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10686208189243855, + "language_loss": 0.91100538, + "learning_rate": 0.000989657473741779, + "loss": 0.92376775, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25402832, + "routerloss_mlp": 0.0, + "step": 483, + "time_per_iteration": 2.8294553756713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275465, + "balance_loss_mlp": 1.25022864, + "diversity_loss_mlp": 0.0, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.09087050091564236, + "language_loss": 0.92375994, + "learning_rate": 0.0009895943406403465, + "loss": 0.93651462, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.25244141, + "routerloss_mlp": 0.0, + "step": 484, + "time_per_iteration": 2.728445053100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231643, + "balance_loss_mlp": 1.20584655, + "diversity_loss_mlp": 0.0, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.11173906110031175, + "language_loss": 0.85102737, + "learning_rate": 0.0009895310174615338, + "loss": 0.86334383, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.25805664, + "routerloss_mlp": 0.0, + "step": 485, + "time_per_iteration": 2.809858560562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01674879, + "balance_loss_mlp": 1.65122819, + "diversity_loss_mlp": 0.0, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.0891862493938321, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.77393395, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.23632812, + "routerloss_mlp": 0.0, + "step": 486, + "time_per_iteration": 4.675356388092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149968, + "balance_loss_mlp": 1.1268059, + "diversity_loss_mlp": 0.0, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.12873710921953274, + "language_loss": 0.89867461, + "learning_rate": 0.0009894038009701782, + "loss": 0.91017425, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.23168945, + "routerloss_mlp": 0.0, + "step": 487, + "time_per_iteration": 2.646655797958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141338, + "balance_loss_mlp": 1.11786556, + "diversity_loss_mlp": 0.0, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.11717214663903742, + "language_loss": 0.89069557, + "learning_rate": 0.0009893399077070253, + "loss": 0.90210891, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.23474121, + "routerloss_mlp": 0.0, + "step": 488, + "time_per_iteration": 2.578733444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936332, + "balance_loss_mlp": 1.59238243, + "diversity_loss_mlp": 0.24211329, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.03786592480343135, + "language_loss": 0.88446009, + "learning_rate": 0.0009892758244652718, + "loss": 0.89382339, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0190843, + "step": 489, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131365, + "balance_loss_mlp": 1.10876274, + "diversity_loss_mlp": 0.0, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.09957245788293691, + "language_loss": 0.92780352, + "learning_rate": 0.0009892115512697968, + "loss": 0.93911719, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 490, + "time_per_iteration": 2.6975181102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127367, + "balance_loss_mlp": 1.10648203, + "diversity_loss_mlp": 0.0, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.09077239739165983, + "language_loss": 0.95311546, + "learning_rate": 0.0009891470881455537, + "loss": 0.96438909, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 491, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141034, + "balance_loss_mlp": 1.12092364, + "diversity_loss_mlp": 0.0, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08843271909801863, + "language_loss": 0.91967297, + "learning_rate": 0.0009890824351175692, + "loss": 0.93108326, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.20092773, + "routerloss_mlp": 0.0, + "step": 492, + "time_per_iteration": 2.689789295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148763, + "balance_loss_mlp": 1.12847304, + "diversity_loss_mlp": 0.0, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.0818574716555875, + "language_loss": 0.96715915, + "learning_rate": 0.0009890175922109435, + "loss": 0.97864676, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.20288086, + "routerloss_mlp": 0.0, + "step": 493, + "time_per_iteration": 2.653787136077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161837, + "balance_loss_mlp": 1.14108253, + "diversity_loss_mlp": 0.0, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.10785532679009643, + "language_loss": 0.94627249, + "learning_rate": 0.0009889525594508513, + "loss": 0.95789087, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.20751953, + "routerloss_mlp": 0.0, + "step": 494, + "time_per_iteration": 3.013289213180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168804, + "balance_loss_mlp": 1.14887238, + "diversity_loss_mlp": 0.0, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.09313196509024183, + "language_loss": 0.89226812, + "learning_rate": 0.0009888873368625404, + "loss": 0.90395617, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 495, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215397, + "balance_loss_mlp": 1.19448745, + "diversity_loss_mlp": 0.0, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.11525575263217126, + "language_loss": 0.92808712, + "learning_rate": 0.0009888219244713326, + "loss": 0.94024116, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 496, + "time_per_iteration": 2.828477382659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235818, + "balance_loss_mlp": 1.2138716, + "diversity_loss_mlp": 0.0, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.13708349411569606, + "language_loss": 0.92383498, + "learning_rate": 0.0009887563223026229, + "loss": 0.93619317, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 497, + "time_per_iteration": 2.6688501834869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03358766, + "balance_loss_mlp": 3.33902526, + "diversity_loss_mlp": 0.0, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.4973253845941573, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.82426929, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19726562, + "routerloss_mlp": 0.0, + "step": 498, + "time_per_iteration": 4.9225428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125204, + "balance_loss_mlp": 1.22810328, + "diversity_loss_mlp": 0.0, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.09338533863845942, + "language_loss": 0.9145627, + "learning_rate": 0.0009886245487346482, + "loss": 0.92708313, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.23925781, + "routerloss_mlp": 0.0, + "step": 499, + "time_per_iteration": 3.0396392345428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273949, + "balance_loss_mlp": 1.24874783, + "diversity_loss_mlp": 0.0, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.12406156723875504, + "language_loss": 0.94657683, + "learning_rate": 0.0009885583773865422, + "loss": 0.95931631, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 500, + "time_per_iteration": 2.434283971786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319213, + "balance_loss_mlp": 1.29096031, + "diversity_loss_mlp": 0.0, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.11518840252548597, + "language_loss": 0.91528684, + "learning_rate": 0.0009884920163632524, + "loss": 0.92847896, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2824707, + "routerloss_mlp": 0.0, + "step": 501, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131255, + "balance_loss_mlp": 1.28246212, + "diversity_loss_mlp": 0.0, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.12991803618191863, + "language_loss": 0.93797207, + "learning_rate": 0.000988425465690543, + "loss": 0.95109755, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.30102539, + "routerloss_mlp": 0.0, + "step": 502, + "time_per_iteration": 2.5672004222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283439, + "balance_loss_mlp": 1.25225365, + "diversity_loss_mlp": 0.0, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.11000587000012971, + "language_loss": 0.91223967, + "learning_rate": 0.0009883587253942505, + "loss": 0.92507404, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.31152344, + "routerloss_mlp": 0.0, + "step": 503, + "time_per_iteration": 2.7560157775878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273545, + "balance_loss_mlp": 1.24281311, + "diversity_loss_mlp": 0.0, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.10509235815923167, + "language_loss": 0.97371984, + "learning_rate": 0.0009882917955002862, + "loss": 0.9864552, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.30712891, + "routerloss_mlp": 0.0, + "step": 504, + "time_per_iteration": 2.5183091163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227481, + "balance_loss_mlp": 1.1978929, + "diversity_loss_mlp": 0.0, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.11004475447178139, + "language_loss": 0.90284961, + "learning_rate": 0.0009882246760346343, + "loss": 0.91512442, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.2956543, + "routerloss_mlp": 0.0, + "step": 505, + "time_per_iteration": 2.6169376373291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215441, + "balance_loss_mlp": 1.18637753, + "diversity_loss_mlp": 0.0, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.13294554223904492, + "language_loss": 0.94025862, + "learning_rate": 0.0009881573670233533, + "loss": 0.95241302, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.29077148, + "routerloss_mlp": 0.0, + "step": 506, + "time_per_iteration": 2.5373079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012064, + "balance_loss_mlp": 1.17976809, + "diversity_loss_mlp": 0.0, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.07932421313758002, + "language_loss": 0.89223576, + "learning_rate": 0.0009880898684925747, + "loss": 0.90429974, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.26660156, + "routerloss_mlp": 0.0, + "step": 507, + "time_per_iteration": 2.661796808242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206827, + "balance_loss_mlp": 1.18070853, + "diversity_loss_mlp": 0.0, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09132088261693337, + "language_loss": 0.87935519, + "learning_rate": 0.0009880221804685037, + "loss": 0.89142346, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.26159668, + "routerloss_mlp": 0.0, + "step": 508, + "time_per_iteration": 2.542513608932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02552291, + "balance_loss_mlp": 2.42869496, + "diversity_loss_mlp": 0.0, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.1282373293100265, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.8189671, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 509, + "time_per_iteration": 4.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01280503, + "balance_loss_mlp": 1.25399113, + "diversity_loss_mlp": 0.0, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.09929466646798928, + "language_loss": 0.93586993, + "learning_rate": 0.0009878862360456733, + "loss": 0.94867498, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.265625, + "routerloss_mlp": 0.0, + "step": 510, + "time_per_iteration": 2.6981284618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284628, + "balance_loss_mlp": 1.25883126, + "diversity_loss_mlp": 0.0, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.10250849932844218, + "language_loss": 0.87516463, + "learning_rate": 0.0009878179796996922, + "loss": 0.88801086, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.25817871, + "routerloss_mlp": 0.0, + "step": 511, + "time_per_iteration": 2.7541561126708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281708, + "balance_loss_mlp": 1.25468373, + "diversity_loss_mlp": 0.0, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.10234956077068923, + "language_loss": 0.90780497, + "learning_rate": 0.0009877495339659754, + "loss": 0.92062211, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.27038574, + "routerloss_mlp": 0.0, + "step": 512, + "time_per_iteration": 2.7744665145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278173, + "balance_loss_mlp": 1.25241184, + "diversity_loss_mlp": 0.0, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.11291475079800635, + "language_loss": 0.85683644, + "learning_rate": 0.000987680898871096, + "loss": 0.86961818, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "routerloss_mlp": 0.0, + "step": 513, + "time_per_iteration": 2.8321592807769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289018, + "balance_loss_mlp": 1.26217198, + "diversity_loss_mlp": 0.0, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10190264212433507, + "language_loss": 0.85800934, + "learning_rate": 0.0009876120744417, + "loss": 0.87089956, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.26867676, + "routerloss_mlp": 0.0, + "step": 514, + "time_per_iteration": 2.945312023162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245022, + "balance_loss_mlp": 1.2198211, + "diversity_loss_mlp": 0.0, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.09616865198011539, + "language_loss": 0.94088352, + "learning_rate": 0.0009875430607045078, + "loss": 0.9533338, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2520752, + "routerloss_mlp": 0.0, + "step": 515, + "time_per_iteration": 2.656282663345337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214395, + "balance_loss_mlp": 1.19058895, + "diversity_loss_mlp": 0.0, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.0895550710797692, + "language_loss": 0.91242373, + "learning_rate": 0.000987473857686313, + "loss": 0.9245677, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.23791504, + "routerloss_mlp": 0.0, + "step": 516, + "time_per_iteration": 2.7530250549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218622, + "balance_loss_mlp": 1.19458985, + "diversity_loss_mlp": 0.0, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.11626991588591096, + "language_loss": 0.92559797, + "learning_rate": 0.0009874044654139824, + "loss": 0.93778414, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.24023438, + "routerloss_mlp": 0.0, + "step": 517, + "time_per_iteration": 2.7673146724700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188049, + "balance_loss_mlp": 1.16410005, + "diversity_loss_mlp": 0.0, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.09260385447056875, + "language_loss": 0.91065013, + "learning_rate": 0.0009873348839144563, + "loss": 0.92253065, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.23950195, + "routerloss_mlp": 0.0, + "step": 518, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162168, + "balance_loss_mlp": 1.13979197, + "diversity_loss_mlp": 0.0, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.07604390633760301, + "language_loss": 0.95252264, + "learning_rate": 0.000987265113214749, + "loss": 0.96414435, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.22375488, + "routerloss_mlp": 0.0, + "step": 519, + "time_per_iteration": 2.556882619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171262, + "balance_loss_mlp": 1.14849353, + "diversity_loss_mlp": 0.0, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.093032650642813, + "language_loss": 0.94720447, + "learning_rate": 0.0009871951533419476, + "loss": 0.95891708, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.22753906, + "routerloss_mlp": 0.0, + "step": 520, + "time_per_iteration": 2.724825143814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163285, + "balance_loss_mlp": 1.14063525, + "diversity_loss_mlp": 0.0, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.07732484115861517, + "language_loss": 0.87440532, + "learning_rate": 0.0009871250043232132, + "loss": 0.88603818, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.22631836, + "routerloss_mlp": 0.0, + "step": 521, + "time_per_iteration": 2.756647825241089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171709, + "balance_loss_mlp": 1.14840364, + "diversity_loss_mlp": 0.0, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08586449419627491, + "language_loss": 0.8592059, + "learning_rate": 0.0009870546661857797, + "loss": 0.87092298, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.23291016, + "routerloss_mlp": 0.0, + "step": 522, + "time_per_iteration": 2.611241340637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188262, + "balance_loss_mlp": 1.16447985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.11121774977632432, + "language_loss": 0.93899059, + "learning_rate": 0.0009869841389569553, + "loss": 0.9508732, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.2376709, + "routerloss_mlp": 0.0, + "step": 523, + "time_per_iteration": 2.986001491546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897074, + "balance_loss_mlp": 1.51972795, + "diversity_loss_mlp": 0.23477924, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.04055297882665198, + "language_loss": 0.88430732, + "learning_rate": 0.0009869134226641206, + "loss": 0.89327806, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01982057, + "step": 524, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213869, + "balance_loss_mlp": 1.19106424, + "diversity_loss_mlp": 0.0, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.1040439940574723, + "language_loss": 0.87633705, + "learning_rate": 0.0009868425173347303, + "loss": 0.88847572, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 525, + "time_per_iteration": 2.679245710372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202393, + "balance_loss_mlp": 1.17973125, + "diversity_loss_mlp": 0.0, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.10306076043273057, + "language_loss": 0.95430547, + "learning_rate": 0.0009867714229963125, + "loss": 0.96632946, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 526, + "time_per_iteration": 2.6960504055023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194179, + "balance_loss_mlp": 1.17121899, + "diversity_loss_mlp": 0.0, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.13221329860014494, + "language_loss": 0.92439747, + "learning_rate": 0.000986700139676468, + "loss": 0.93633932, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 527, + "time_per_iteration": 2.5740442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226752, + "balance_loss_mlp": 1.20331526, + "diversity_loss_mlp": 0.0, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.07480383753700154, + "language_loss": 0.90178651, + "learning_rate": 0.0009866286674028717, + "loss": 0.91405398, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 528, + "time_per_iteration": 2.6214394569396973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00901033, + "balance_loss_mlp": 1.53179681, + "diversity_loss_mlp": 0.23385583, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.042015219172821444, + "language_loss": 0.87127066, + "learning_rate": 0.0009865570062032717, + "loss": 0.88028097, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820667, + "step": 529, + "time_per_iteration": 2.947612762451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243163, + "balance_loss_mlp": 1.21885657, + "diversity_loss_mlp": 0.0, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.11620953964099495, + "language_loss": 0.91896212, + "learning_rate": 0.0009864851561054893, + "loss": 0.93139374, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.24304199, + "routerloss_mlp": 0.0, + "step": 530, + "time_per_iteration": 2.8097901344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192516, + "balance_loss_mlp": 1.16937733, + "diversity_loss_mlp": 0.0, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.0991735208834069, + "language_loss": 0.90383148, + "learning_rate": 0.0009864131171374191, + "loss": 0.9157567, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23132324, + "routerloss_mlp": 0.0, + "step": 531, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169363, + "balance_loss_mlp": 1.14682031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.08125371515716559, + "language_loss": 0.90489674, + "learning_rate": 0.0009863408893270292, + "loss": 0.91659039, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.2253418, + "routerloss_mlp": 0.0, + "step": 532, + "time_per_iteration": 2.7877254486083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.1120224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.11770570969906818, + "language_loss": 0.85183895, + "learning_rate": 0.0009862684727023605, + "loss": 0.8631804, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 533, + "time_per_iteration": 2.717573642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128208, + "balance_loss_mlp": 1.10571277, + "diversity_loss_mlp": 0.0, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10673213842736717, + "language_loss": 0.88664484, + "learning_rate": 0.0009861958672915283, + "loss": 0.89792687, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.22497559, + "routerloss_mlp": 0.0, + "step": 534, + "time_per_iteration": 2.7880847454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111155, + "balance_loss_mlp": 1.08948302, + "diversity_loss_mlp": 0.0, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.11915216532291298, + "language_loss": 0.88834876, + "learning_rate": 0.0009861230731227201, + "loss": 0.89946032, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.21679688, + "routerloss_mlp": 0.0, + "step": 535, + "time_per_iteration": 2.844203233718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121725, + "balance_loss_mlp": 1.10002935, + "diversity_loss_mlp": 0.0, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.11019657032079996, + "language_loss": 0.90318179, + "learning_rate": 0.0009860500902241973, + "loss": 0.91439903, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.21716309, + "routerloss_mlp": 0.0, + "step": 536, + "time_per_iteration": 2.5753133296966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126411, + "balance_loss_mlp": 1.10444033, + "diversity_loss_mlp": 0.0, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.13353850851854182, + "language_loss": 0.95278764, + "learning_rate": 0.0009859769186242942, + "loss": 0.96405172, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.21984863, + "routerloss_mlp": 0.0, + "step": 537, + "time_per_iteration": 2.544611930847168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894726, + "balance_loss_mlp": 1.52693653, + "diversity_loss_mlp": 0.22699235, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.04205207536563703, + "language_loss": 0.88558614, + "learning_rate": 0.0009859035583514187, + "loss": 0.8945334, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01776124, + "step": 538, + "time_per_iteration": 2.647594451904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257859, + "balance_loss_mlp": 1.23475599, + "diversity_loss_mlp": 0.0, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.11200334451020948, + "language_loss": 0.89448857, + "learning_rate": 0.0009858300094340517, + "loss": 0.90706718, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.23071289, + "routerloss_mlp": 0.0, + "step": 539, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291272, + "balance_loss_mlp": 1.26785898, + "diversity_loss_mlp": 0.0, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.17493624211104222, + "language_loss": 0.84562349, + "learning_rate": 0.0009857562719007473, + "loss": 0.85853624, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.23388672, + "routerloss_mlp": 0.0, + "step": 540, + "time_per_iteration": 2.6256375312805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267144, + "balance_loss_mlp": 1.24492311, + "diversity_loss_mlp": 0.0, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.14114133743563548, + "language_loss": 0.86615884, + "learning_rate": 0.0009856823457801331, + "loss": 0.87883031, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 541, + "time_per_iteration": 2.8773691654205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254714, + "balance_loss_mlp": 1.23256469, + "diversity_loss_mlp": 0.0, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08733197639022866, + "language_loss": 0.93604994, + "learning_rate": 0.00098560823110091, + "loss": 0.94859707, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.22167969, + "routerloss_mlp": 0.0, + "step": 542, + "time_per_iteration": 2.6173057556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206738, + "balance_loss_mlp": 1.18436217, + "diversity_loss_mlp": 0.0, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.14252191795618116, + "language_loss": 0.94814467, + "learning_rate": 0.000985533927891851, + "loss": 0.96021199, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22387695, + "routerloss_mlp": 0.0, + "step": 543, + "time_per_iteration": 2.682035207748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00924177, + "balance_loss_mlp": 1.58877563, + "diversity_loss_mlp": 0.22542018, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.04171093567104517, + "language_loss": 0.92462713, + "learning_rate": 0.0009854594361818044, + "loss": 0.93386889, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01707876, + "step": 544, + "time_per_iteration": 2.771606922149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134006, + "balance_loss_mlp": 1.11126077, + "diversity_loss_mlp": 0.0, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.16622789723447462, + "language_loss": 0.91736549, + "learning_rate": 0.0009853847559996897, + "loss": 0.92870551, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22729492, + "routerloss_mlp": 0.0, + "step": 545, + "time_per_iteration": 2.714980363845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131307, + "balance_loss_mlp": 1.10896707, + "diversity_loss_mlp": 0.0, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.13863422454282084, + "language_loss": 0.90834534, + "learning_rate": 0.0009853098873745, + "loss": 0.91965836, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.22351074, + "routerloss_mlp": 0.0, + "step": 546, + "time_per_iteration": 2.98349928855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127677, + "balance_loss_mlp": 1.10500383, + "diversity_loss_mlp": 0.0, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.15888834478547278, + "language_loss": 0.90073705, + "learning_rate": 0.0009852348303353027, + "loss": 0.91201389, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22668457, + "routerloss_mlp": 0.0, + "step": 547, + "time_per_iteration": 2.782012701034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148987, + "balance_loss_mlp": 1.12613487, + "diversity_loss_mlp": 0.0, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.10179846154330349, + "language_loss": 0.82990968, + "learning_rate": 0.000985159584911237, + "loss": 0.84139955, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 548, + "time_per_iteration": 3.102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216658, + "balance_loss_mlp": 1.19307828, + "diversity_loss_mlp": 0.0, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.12466178148261096, + "language_loss": 0.89916652, + "learning_rate": 0.0009850841511315162, + "loss": 0.91133308, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.2355957, + "routerloss_mlp": 0.0, + "step": 549, + "time_per_iteration": 2.61226749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241093, + "balance_loss_mlp": 1.21708441, + "diversity_loss_mlp": 0.0, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.11901003741868514, + "language_loss": 0.90615034, + "learning_rate": 0.0009850085290254256, + "loss": 0.91856128, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.23986816, + "routerloss_mlp": 0.0, + "step": 550, + "time_per_iteration": 2.7958199977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914838, + "balance_loss_mlp": 1.5724771, + "diversity_loss_mlp": 0.22113116, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.03122458898086593, + "language_loss": 0.87977409, + "learning_rate": 0.0009849327186223246, + "loss": 0.88892245, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0180343, + "step": 551, + "time_per_iteration": 2.799394130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242815, + "balance_loss_mlp": 1.21818638, + "diversity_loss_mlp": 0.0, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.10957849833176474, + "language_loss": 0.95181417, + "learning_rate": 0.000984856719951646, + "loss": 0.96424234, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.24609375, + "routerloss_mlp": 0.0, + "step": 552, + "time_per_iteration": 2.559286117553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121032, + "balance_loss_mlp": 1.18546462, + "diversity_loss_mlp": 0.0, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.09349197696587547, + "language_loss": 0.91760498, + "learning_rate": 0.0009847805330428943, + "loss": 0.92970818, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24865723, + "routerloss_mlp": 0.0, + "step": 553, + "time_per_iteration": 2.906571388244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875983, + "balance_loss_mlp": 1.49139261, + "diversity_loss_mlp": 0.22127438, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.05457604420902532, + "language_loss": 0.93558431, + "learning_rate": 0.0009847041579256481, + "loss": 0.94434416, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01964992, + "step": 554, + "time_per_iteration": 2.6159372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202664, + "balance_loss_mlp": 1.17859542, + "diversity_loss_mlp": 0.0, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08395889420783041, + "language_loss": 0.94042808, + "learning_rate": 0.0009846275946295592, + "loss": 0.95245475, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.24072266, + "routerloss_mlp": 0.0, + "step": 555, + "time_per_iteration": 2.592341184616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182493, + "balance_loss_mlp": 1.15904498, + "diversity_loss_mlp": 0.0, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.08262845202589308, + "language_loss": 0.8740595, + "learning_rate": 0.0009845508431843518, + "loss": 0.8858844, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23425293, + "routerloss_mlp": 0.0, + "step": 556, + "time_per_iteration": 3.0123813152313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177615, + "balance_loss_mlp": 1.15481031, + "diversity_loss_mlp": 0.0, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.07593810566908125, + "language_loss": 0.88148719, + "learning_rate": 0.0009844739036198233, + "loss": 0.8932634, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.22814941, + "routerloss_mlp": 0.0, + "step": 557, + "time_per_iteration": 2.6356143951416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184514, + "balance_loss_mlp": 1.16157842, + "diversity_loss_mlp": 0.0, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.09177793780956148, + "language_loss": 0.94916999, + "learning_rate": 0.0009843967759658448, + "loss": 0.96101511, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 558, + "time_per_iteration": 2.6546378135681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02293865, + "balance_loss_mlp": 2.17026901, + "diversity_loss_mlp": 0.0, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.09925677209713644, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.75061619, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 1.234375, + "routerloss_mlp": 0.0, + "step": 559, + "time_per_iteration": 4.829499244689941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207667, + "balance_loss_mlp": 1.18555331, + "diversity_loss_mlp": 0.0, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.1031420062274817, + "language_loss": 0.9552027, + "learning_rate": 0.000984241956509384, + "loss": 0.96727937, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 560, + "time_per_iteration": 2.65759539604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204016, + "balance_loss_mlp": 1.18220043, + "diversity_loss_mlp": 0.0, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08944048757536185, + "language_loss": 0.90505213, + "learning_rate": 0.0009841642647670078, + "loss": 0.91709226, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 561, + "time_per_iteration": 2.591806173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194467, + "balance_loss_mlp": 1.17308092, + "diversity_loss_mlp": 0.0, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08297191380839272, + "language_loss": 0.85483265, + "learning_rate": 0.0009840863850553944, + "loss": 0.8667773, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.21398926, + "routerloss_mlp": 0.0, + "step": 562, + "time_per_iteration": 2.963149309158325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179499, + "balance_loss_mlp": 1.15856552, + "diversity_loss_mlp": 0.0, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.18759249419324772, + "language_loss": 0.9088884, + "learning_rate": 0.0009840083174047782, + "loss": 0.92068338, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.20947266, + "routerloss_mlp": 0.0, + "step": 563, + "time_per_iteration": 2.71415114402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169496, + "balance_loss_mlp": 1.14940953, + "diversity_loss_mlp": 0.0, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.08351477183844232, + "language_loss": 0.86295354, + "learning_rate": 0.0009839300618454685, + "loss": 0.87464857, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.20080566, + "routerloss_mlp": 0.0, + "step": 564, + "time_per_iteration": 2.8288042545318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163807, + "balance_loss_mlp": 1.14280224, + "diversity_loss_mlp": 0.0, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.0761185875884483, + "language_loss": 0.9141686, + "learning_rate": 0.0009838516184078466, + "loss": 0.92580664, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.21020508, + "routerloss_mlp": 0.0, + "step": 565, + "time_per_iteration": 2.8194022178649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177734, + "balance_loss_mlp": 1.15682447, + "diversity_loss_mlp": 0.0, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.14122321260962364, + "language_loss": 0.88377023, + "learning_rate": 0.0009837729871223669, + "loss": 0.89554763, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.20922852, + "routerloss_mlp": 0.0, + "step": 566, + "time_per_iteration": 2.6096079349517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194985, + "balance_loss_mlp": 1.17372978, + "diversity_loss_mlp": 0.0, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.1066586812750682, + "language_loss": 0.88896918, + "learning_rate": 0.0009836941680195568, + "loss": 0.90091902, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.21264648, + "routerloss_mlp": 0.0, + "step": 567, + "time_per_iteration": 2.779846429824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210465, + "balance_loss_mlp": 1.18900692, + "diversity_loss_mlp": 0.0, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.09744135285550241, + "language_loss": 0.84777021, + "learning_rate": 0.0009836151611300166, + "loss": 0.85987484, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 568, + "time_per_iteration": 3.2130274772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210546, + "balance_loss_mlp": 1.18979168, + "diversity_loss_mlp": 0.0, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0877787159655237, + "language_loss": 0.95202124, + "learning_rate": 0.0009835359664844194, + "loss": 0.96412671, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.2076416, + "routerloss_mlp": 0.0, + "step": 569, + "time_per_iteration": 2.614626407623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02163392, + "balance_loss_mlp": 2.12848806, + "diversity_loss_mlp": 0.0, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.098326155744124, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.83200204, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.34960938, + "routerloss_mlp": 0.0, + "step": 570, + "time_per_iteration": 4.910563230514526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188679, + "balance_loss_mlp": 1.16738796, + "diversity_loss_mlp": 0.0, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.10673198509513786, + "language_loss": 0.92503107, + "learning_rate": 0.0009833770140481118, + "loss": 0.93691778, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.21313477, + "routerloss_mlp": 0.0, + "step": 571, + "time_per_iteration": 2.6361794471740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167522, + "balance_loss_mlp": 1.14587367, + "diversity_loss_mlp": 0.0, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.06757736028097705, + "language_loss": 0.82720339, + "learning_rate": 0.000983297256319112, + "loss": 0.83887863, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.21655273, + "routerloss_mlp": 0.0, + "step": 572, + "time_per_iteration": 3.2420709133148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148716, + "balance_loss_mlp": 1.12606621, + "diversity_loss_mlp": 0.0, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.09218112459591986, + "language_loss": 0.87054348, + "learning_rate": 0.000983217310957477, + "loss": 0.88203067, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.2265625, + "routerloss_mlp": 0.0, + "step": 573, + "time_per_iteration": 2.7485547065734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139139, + "balance_loss_mlp": 1.11725259, + "diversity_loss_mlp": 0.0, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08282639029669561, + "language_loss": 0.90421212, + "learning_rate": 0.000983137177994244, + "loss": 0.91560352, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.21899414, + "routerloss_mlp": 0.0, + "step": 574, + "time_per_iteration": 2.8651185035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142425, + "balance_loss_mlp": 1.11990607, + "diversity_loss_mlp": 0.0, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08655490231030577, + "language_loss": 0.8561765, + "learning_rate": 0.0009830568574605235, + "loss": 0.8676008, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.22521973, + "routerloss_mlp": 0.0, + "step": 575, + "time_per_iteration": 2.942331075668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162411, + "balance_loss_mlp": 1.13946342, + "diversity_loss_mlp": 0.0, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.08792859421485215, + "language_loss": 0.88113999, + "learning_rate": 0.0009829763493874992, + "loss": 0.89276409, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.22912598, + "routerloss_mlp": 0.0, + "step": 576, + "time_per_iteration": 3.0282514095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173849, + "balance_loss_mlp": 1.15098429, + "diversity_loss_mlp": 0.0, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.10676499351314739, + "language_loss": 0.9303807, + "learning_rate": 0.0009828956538064264, + "loss": 0.94211912, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.2286377, + "routerloss_mlp": 0.0, + "step": 577, + "time_per_iteration": 2.7946369647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173248, + "balance_loss_mlp": 1.1503005, + "diversity_loss_mlp": 0.0, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.11074471638842859, + "language_loss": 0.91223717, + "learning_rate": 0.0009828147707486344, + "loss": 0.92396963, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.22937012, + "routerloss_mlp": 0.0, + "step": 578, + "time_per_iteration": 2.731588125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13424993, + "diversity_loss_mlp": 0.0, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.09317476454713723, + "language_loss": 0.86116958, + "learning_rate": 0.0009827337002455245, + "loss": 0.87273794, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.22583008, + "routerloss_mlp": 0.0, + "step": 579, + "time_per_iteration": 2.639047145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134437, + "balance_loss_mlp": 1.11184728, + "diversity_loss_mlp": 0.0, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07918824025832125, + "language_loss": 0.88299757, + "learning_rate": 0.0009826524423285712, + "loss": 0.89434195, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.22595215, + "routerloss_mlp": 0.0, + "step": 580, + "time_per_iteration": 2.911012649536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114105, + "balance_loss_mlp": 1.11881745, + "diversity_loss_mlp": 0.0, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.10469703454021252, + "language_loss": 0.89618349, + "learning_rate": 0.0009825709970293218, + "loss": 0.90759397, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.22229004, + "routerloss_mlp": 0.0, + "step": 581, + "time_per_iteration": 2.8837828636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135626, + "balance_loss_mlp": 1.11433506, + "diversity_loss_mlp": 0.0, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.1022616119694228, + "language_loss": 0.95317924, + "learning_rate": 0.0009824893643793956, + "loss": 0.96453559, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.21289062, + "routerloss_mlp": 0.0, + "step": 582, + "time_per_iteration": 3.0962114334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948798, + "balance_loss_mlp": 1.63779283, + "diversity_loss_mlp": 0.22248407, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.04350556393742171, + "language_loss": 0.88843536, + "learning_rate": 0.0009824075444104857, + "loss": 0.89792335, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01865991, + "step": 583, + "time_per_iteration": 2.719085454940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157517, + "balance_loss_mlp": 1.13638163, + "diversity_loss_mlp": 0.0, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.10740950198198211, + "language_loss": 0.93831933, + "learning_rate": 0.000982325537154357, + "loss": 0.94989443, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.21154785, + "routerloss_mlp": 0.0, + "step": 584, + "time_per_iteration": 2.597120523452759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117352, + "balance_loss_mlp": 1.15234792, + "diversity_loss_mlp": 0.0, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.12322952105084124, + "language_loss": 0.94442445, + "learning_rate": 0.0009822433426428484, + "loss": 0.95615965, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.21179199, + "routerloss_mlp": 0.0, + "step": 585, + "time_per_iteration": 2.571805238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238103, + "balance_loss_mlp": 1.2166214, + "diversity_loss_mlp": 0.0, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.08678287386034968, + "language_loss": 0.87089044, + "learning_rate": 0.0009821609609078697, + "loss": 0.88327146, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.21484375, + "routerloss_mlp": 0.0, + "step": 586, + "time_per_iteration": 2.586289405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320429, + "balance_loss_mlp": 1.29861343, + "diversity_loss_mlp": 0.0, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.09324667942342675, + "language_loss": 0.89581811, + "learning_rate": 0.0009820783919814045, + "loss": 0.90902239, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.21826172, + "routerloss_mlp": 0.0, + "step": 587, + "time_per_iteration": 2.804417848587036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01397697, + "balance_loss_mlp": 1.37499988, + "diversity_loss_mlp": 0.0, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.11766834316785481, + "language_loss": 0.82825267, + "learning_rate": 0.0009819956358955095, + "loss": 0.8422296, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.22705078, + "routerloss_mlp": 0.0, + "step": 588, + "time_per_iteration": 2.5654590129852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433511, + "balance_loss_mlp": 1.41009879, + "diversity_loss_mlp": 0.0, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.13254981657968556, + "language_loss": 0.84316242, + "learning_rate": 0.0009819126926823127, + "loss": 0.85749757, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23413086, + "routerloss_mlp": 0.0, + "step": 589, + "time_per_iteration": 2.5090954303741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369151, + "balance_loss_mlp": 1.34720445, + "diversity_loss_mlp": 0.0, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.12923638752993147, + "language_loss": 0.87131608, + "learning_rate": 0.000981829562374016, + "loss": 0.88500756, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.21948242, + "routerloss_mlp": 0.0, + "step": 590, + "time_per_iteration": 2.7904558181762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263432, + "balance_loss_mlp": 1.24309444, + "diversity_loss_mlp": 0.0, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.0979331207375339, + "language_loss": 0.97635686, + "learning_rate": 0.0009817462450028933, + "loss": 0.98899126, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 591, + "time_per_iteration": 2.6596498489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186211, + "balance_loss_mlp": 1.16698265, + "diversity_loss_mlp": 0.0, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.0791908179615389, + "language_loss": 0.85476398, + "learning_rate": 0.0009816627406012916, + "loss": 0.86662614, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 592, + "time_per_iteration": 2.795384168624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_mlp": 1.12423062, + "diversity_loss_mlp": 0.0, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.14133504737490046, + "language_loss": 0.85158926, + "learning_rate": 0.0009815790492016295, + "loss": 0.86301947, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.18774414, + "routerloss_mlp": 0.0, + "step": 593, + "time_per_iteration": 2.968202829360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113857, + "balance_loss_mlp": 1.11954474, + "diversity_loss_mlp": 0.0, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.10990083394980393, + "language_loss": 0.87156999, + "learning_rate": 0.0009814951708363993, + "loss": 0.88295579, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.19006348, + "routerloss_mlp": 0.0, + "step": 594, + "time_per_iteration": 2.8341050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01993613, + "balance_loss_mlp": 1.96176016, + "diversity_loss_mlp": 0.0, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.10325359814292956, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79984605, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.31835938, + "routerloss_mlp": 0.0, + "step": 595, + "time_per_iteration": 4.746119976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11163688, + "diversity_loss_mlp": 0.0, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.1448933947746474, + "language_loss": 0.89056683, + "learning_rate": 0.0009813268533395648, + "loss": 0.90187395, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.19067383, + "routerloss_mlp": 0.0, + "step": 596, + "time_per_iteration": 2.592421054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151969, + "balance_loss_mlp": 1.13301492, + "diversity_loss_mlp": 0.0, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12455054099529249, + "language_loss": 0.8755219, + "learning_rate": 0.0009812424142733073, + "loss": 0.88704157, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.18933105, + "routerloss_mlp": 0.0, + "step": 597, + "time_per_iteration": 2.549654483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158973, + "balance_loss_mlp": 1.13961387, + "diversity_loss_mlp": 0.0, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.1533400924271749, + "language_loss": 0.86129421, + "learning_rate": 0.000981157788372175, + "loss": 0.87288398, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 598, + "time_per_iteration": 3.029372453689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181573, + "balance_loss_mlp": 1.16308403, + "diversity_loss_mlp": 0.0, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.08122879346901381, + "language_loss": 0.89185023, + "learning_rate": 0.0009810729756690223, + "loss": 0.90366596, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.18481445, + "routerloss_mlp": 0.0, + "step": 599, + "time_per_iteration": 2.72200608253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225343, + "balance_loss_mlp": 1.20584035, + "diversity_loss_mlp": 0.0, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09322481346022114, + "language_loss": 0.91937912, + "learning_rate": 0.0009809879761967766, + "loss": 0.93163252, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.19482422, + "routerloss_mlp": 0.0, + "step": 600, + "time_per_iteration": 2.9454104900360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240049, + "balance_loss_mlp": 1.22046316, + "diversity_loss_mlp": 0.0, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11235514763344263, + "language_loss": 0.86727029, + "learning_rate": 0.0009809027899884378, + "loss": 0.87967086, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.19580078, + "routerloss_mlp": 0.0, + "step": 601, + "time_per_iteration": 2.888047218322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288764, + "balance_loss_mlp": 1.26829576, + "diversity_loss_mlp": 0.0, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.07021797329248278, + "language_loss": 0.88593882, + "learning_rate": 0.0009808174170770779, + "loss": 0.89882648, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.20458984, + "routerloss_mlp": 0.0, + "step": 602, + "time_per_iteration": 2.8045670986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02144093, + "balance_loss_mlp": 2.11128712, + "diversity_loss_mlp": 0.0, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.1124732092134732, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.87042338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.328125, + "routerloss_mlp": 0.0, + "step": 603, + "time_per_iteration": 4.899731397628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01341078, + "balance_loss_mlp": 1.32069361, + "diversity_loss_mlp": 0.0, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.10202627615666406, + "language_loss": 0.93765342, + "learning_rate": 0.0009806461112779462, + "loss": 0.95106417, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 604, + "time_per_iteration": 2.6618311405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291209, + "balance_loss_mlp": 1.27080083, + "diversity_loss_mlp": 0.0, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.13219567018011513, + "language_loss": 0.87928259, + "learning_rate": 0.0009805601784566814, + "loss": 0.89219463, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.20397949, + "routerloss_mlp": 0.0, + "step": 605, + "time_per_iteration": 2.4783012866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229751, + "balance_loss_mlp": 1.20996237, + "diversity_loss_mlp": 0.0, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.07794567116482086, + "language_loss": 0.95705628, + "learning_rate": 0.0009804740590654089, + "loss": 0.9693538, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.19787598, + "routerloss_mlp": 0.0, + "step": 606, + "time_per_iteration": 2.6886532306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155761, + "balance_loss_mlp": 1.13543582, + "diversity_loss_mlp": 0.0, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09113538166915294, + "language_loss": 0.90117687, + "learning_rate": 0.0009803877531375635, + "loss": 0.91273439, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 607, + "time_per_iteration": 2.877068281173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127783, + "balance_loss_mlp": 1.1072073, + "diversity_loss_mlp": 0.0, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.0886917383310614, + "language_loss": 0.90959686, + "learning_rate": 0.0009803012607066523, + "loss": 0.92087471, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.20581055, + "routerloss_mlp": 0.0, + "step": 608, + "time_per_iteration": 2.7187952995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.08786178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.061304878637031934, + "language_loss": 0.89645171, + "learning_rate": 0.0009802145818062543, + "loss": 0.90753502, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 609, + "time_per_iteration": 2.692622423171997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920288, + "balance_loss_mlp": 1.57755673, + "diversity_loss_mlp": 0.22646153, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.03934500472587961, + "language_loss": 0.91726142, + "learning_rate": 0.0009801277164700212, + "loss": 0.92646432, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01827916, + "step": 610, + "time_per_iteration": 2.5983645915985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100014, + "balance_loss_mlp": 1.07810283, + "diversity_loss_mlp": 0.0, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.11493980483313035, + "language_loss": 0.90203917, + "learning_rate": 0.0009800406647316776, + "loss": 0.91303933, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.21911621, + "routerloss_mlp": 0.0, + "step": 611, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02350268, + "balance_loss_mlp": 2.30563617, + "diversity_loss_mlp": 0.0, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.20114955038596882, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7926473, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.44726562, + "routerloss_mlp": 0.0, + "step": 612, + "time_per_iteration": 4.795763254165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111114, + "balance_loss_mlp": 1.09067178, + "diversity_loss_mlp": 0.0, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.10624240262278996, + "language_loss": 0.88978302, + "learning_rate": 0.000979866002183916, + "loss": 0.9008944, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.20471191, + "routerloss_mlp": 0.0, + "step": 613, + "time_per_iteration": 2.660820484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.10140252, + "diversity_loss_mlp": 0.0, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.11793468153173196, + "language_loss": 0.90023279, + "learning_rate": 0.0009797783914423082, + "loss": 0.91144633, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.19946289, + "routerloss_mlp": 0.0, + "step": 614, + "time_per_iteration": 2.8052501678466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154456, + "balance_loss_mlp": 1.13508475, + "diversity_loss_mlp": 0.0, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09232041353489327, + "language_loss": 0.84365702, + "learning_rate": 0.0009796905944342094, + "loss": 0.8552016, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.19360352, + "routerloss_mlp": 0.0, + "step": 615, + "time_per_iteration": 2.829193115234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164283, + "balance_loss_mlp": 1.14475632, + "diversity_loss_mlp": 0.0, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.08204462941928636, + "language_loss": 0.88193601, + "learning_rate": 0.0009796026111937057, + "loss": 0.89357883, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.19519043, + "routerloss_mlp": 0.0, + "step": 616, + "time_per_iteration": 2.5868873596191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165697, + "balance_loss_mlp": 1.14656377, + "diversity_loss_mlp": 0.0, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.08667467412120618, + "language_loss": 0.88612103, + "learning_rate": 0.0009795144417549552, + "loss": 0.89777797, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.19128418, + "routerloss_mlp": 0.0, + "step": 617, + "time_per_iteration": 2.689771890640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163262, + "balance_loss_mlp": 1.14452195, + "diversity_loss_mlp": 0.0, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.07824422885129345, + "language_loss": 0.8978498, + "learning_rate": 0.0009794260861521883, + "loss": 0.90948236, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.18737793, + "routerloss_mlp": 0.0, + "step": 618, + "time_per_iteration": 2.78352689743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154292, + "balance_loss_mlp": 1.13528955, + "diversity_loss_mlp": 0.0, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.09960243519509318, + "language_loss": 0.86907887, + "learning_rate": 0.0009793375444197075, + "loss": 0.88062179, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 619, + "time_per_iteration": 2.618597984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159739, + "balance_loss_mlp": 1.14053416, + "diversity_loss_mlp": 0.0, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.09155899478389973, + "language_loss": 0.85016847, + "learning_rate": 0.000979248816591888, + "loss": 0.86176586, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.1920166, + "routerloss_mlp": 0.0, + "step": 620, + "time_per_iteration": 2.7570278644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145713, + "balance_loss_mlp": 1.12721133, + "diversity_loss_mlp": 0.0, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.1108991519321712, + "language_loss": 0.86349535, + "learning_rate": 0.0009791599027031766, + "loss": 0.87495244, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 621, + "time_per_iteration": 3.2095139026641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137126, + "balance_loss_mlp": 1.11841059, + "diversity_loss_mlp": 0.0, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.09815511109151757, + "language_loss": 0.86187375, + "learning_rate": 0.0009790708027880932, + "loss": 0.873245, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 622, + "time_per_iteration": 2.878537654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01885107, + "balance_loss_mlp": 1.84448004, + "diversity_loss_mlp": 0.0, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.060338107853692736, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.79312396, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.40625, + "routerloss_mlp": 0.0, + "step": 623, + "time_per_iteration": 4.854407787322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147675, + "balance_loss_mlp": 1.12785053, + "diversity_loss_mlp": 0.0, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.08227936779447462, + "language_loss": 0.9313252, + "learning_rate": 0.0009788920450172487, + "loss": 0.94280195, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.19812012, + "routerloss_mlp": 0.0, + "step": 624, + "time_per_iteration": 2.633763551712036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173257, + "balance_loss_mlp": 1.15283692, + "diversity_loss_mlp": 0.0, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.08898942147955141, + "language_loss": 0.90448737, + "learning_rate": 0.0009788023872308875, + "loss": 0.91621995, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.20410156, + "routerloss_mlp": 0.0, + "step": 625, + "time_per_iteration": 2.5277719497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01862648, + "balance_loss_mlp": 1.82163978, + "diversity_loss_mlp": 0.0, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.06145643913195344, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.77291644, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.41015625, + "routerloss_mlp": 0.0, + "step": 626, + "time_per_iteration": 4.746332883834839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165065, + "balance_loss_mlp": 1.1446321, + "diversity_loss_mlp": 0.0, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.07179626691480034, + "language_loss": 0.93775636, + "learning_rate": 0.0009786225140303285, + "loss": 0.94940698, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.2043457, + "routerloss_mlp": 0.0, + "step": 627, + "time_per_iteration": 2.650980234146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154743, + "balance_loss_mlp": 1.13354802, + "diversity_loss_mlp": 0.0, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.1000912175423248, + "language_loss": 0.91955918, + "learning_rate": 0.0009785322986859634, + "loss": 0.93110657, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.21191406, + "routerloss_mlp": 0.0, + "step": 628, + "time_per_iteration": 2.699179172515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0098085, + "balance_loss_mlp": 1.69793713, + "diversity_loss_mlp": 0.22907162, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.03434932946066091, + "language_loss": 0.92752671, + "learning_rate": 0.0009784418975588838, + "loss": 0.93733525, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01734566, + "step": 629, + "time_per_iteration": 2.7467246055603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131113, + "balance_loss_mlp": 1.10905957, + "diversity_loss_mlp": 0.0, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.08662072407619689, + "language_loss": 0.93157279, + "learning_rate": 0.0009783513106841862, + "loss": 0.94288397, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22070312, + "routerloss_mlp": 0.0, + "step": 630, + "time_per_iteration": 2.699862003326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01893774, + "balance_loss_mlp": 1.85181284, + "diversity_loss_mlp": 0.0, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.08318726834589595, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78626478, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.41992188, + "routerloss_mlp": 0.0, + "step": 631, + "time_per_iteration": 4.952157258987427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129662, + "balance_loss_mlp": 1.10740614, + "diversity_loss_mlp": 0.0, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.08011431594745816, + "language_loss": 0.87836802, + "learning_rate": 0.0009781695798326854, + "loss": 0.88966465, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.22265625, + "routerloss_mlp": 0.0, + "step": 632, + "time_per_iteration": 2.5692520141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10132909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.08866631591317527, + "language_loss": 0.87804729, + "learning_rate": 0.0009780784359264365, + "loss": 0.88928837, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.2277832, + "routerloss_mlp": 0.0, + "step": 633, + "time_per_iteration": 2.6267781257629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00719882, + "balance_loss_mlp": 1.16367078, + "diversity_loss_mlp": 0.22089316, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.0030158712959469035, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.74908578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.02760048, + "step": 634, + "time_per_iteration": 4.819004535675049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00956665, + "balance_loss_mlp": 1.64561963, + "diversity_loss_mlp": 0.23289478, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.029780004210258365, + "language_loss": 0.87410563, + "learning_rate": 0.000977895591329867, + "loss": 0.88367236, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017408, + "step": 635, + "time_per_iteration": 2.8417630195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111829, + "balance_loss_mlp": 1.09035909, + "diversity_loss_mlp": 0.0, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.07301537581986137, + "language_loss": 0.86799347, + "learning_rate": 0.000977803890710533, + "loss": 0.87911177, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.21472168, + "routerloss_mlp": 0.0, + "step": 636, + "time_per_iteration": 2.721245765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_mlp": 1.08507979, + "diversity_loss_mlp": 0.0, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.0646034576227674, + "language_loss": 0.93395561, + "learning_rate": 0.0009777120045912774, + "loss": 0.94501537, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.20898438, + "routerloss_mlp": 0.0, + "step": 637, + "time_per_iteration": 2.5976381301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114015, + "balance_loss_mlp": 1.09267688, + "diversity_loss_mlp": 0.0, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.07520229878174765, + "language_loss": 0.89586985, + "learning_rate": 0.0009776199330077736, + "loss": 0.90700996, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.21362305, + "routerloss_mlp": 0.0, + "step": 638, + "time_per_iteration": 2.7055575847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127487, + "balance_loss_mlp": 1.10741186, + "diversity_loss_mlp": 0.0, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08952902399696973, + "language_loss": 0.91934389, + "learning_rate": 0.0009775276759957667, + "loss": 0.93061876, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.20068359, + "routerloss_mlp": 0.0, + "step": 639, + "time_per_iteration": 2.703442096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113385, + "balance_loss_mlp": 1.11285698, + "diversity_loss_mlp": 0.0, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.08734236555353025, + "language_loss": 0.8993817, + "learning_rate": 0.0009774352335910745, + "loss": 0.91072023, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.21008301, + "routerloss_mlp": 0.0, + "step": 640, + "time_per_iteration": 2.798133373260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133691, + "balance_loss_mlp": 1.11327052, + "diversity_loss_mlp": 0.0, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08010684820371014, + "language_loss": 0.94195282, + "learning_rate": 0.000977342605829586, + "loss": 0.95328975, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.20422363, + "routerloss_mlp": 0.0, + "step": 641, + "time_per_iteration": 2.72929310798645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167832, + "balance_loss_mlp": 1.14699411, + "diversity_loss_mlp": 0.0, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.08202605728626432, + "language_loss": 0.85741401, + "learning_rate": 0.0009772497927472623, + "loss": 0.86909235, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.20837402, + "routerloss_mlp": 0.0, + "step": 642, + "time_per_iteration": 3.071017265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166824, + "balance_loss_mlp": 1.14637995, + "diversity_loss_mlp": 0.0, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.0829252807022359, + "language_loss": 0.84863311, + "learning_rate": 0.0009771567943801368, + "loss": 0.86030138, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.20446777, + "routerloss_mlp": 0.0, + "step": 643, + "time_per_iteration": 2.667830228805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180894, + "balance_loss_mlp": 1.16058123, + "diversity_loss_mlp": 0.0, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.07304892670416417, + "language_loss": 0.89067769, + "learning_rate": 0.0009770636107643152, + "loss": 0.90248668, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.203125, + "routerloss_mlp": 0.0, + "step": 644, + "time_per_iteration": 2.715703010559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187033, + "balance_loss_mlp": 1.16633821, + "diversity_loss_mlp": 0.0, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.07624328698635177, + "language_loss": 0.87043303, + "learning_rate": 0.0009769702419359738, + "loss": 0.88230342, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.20703125, + "routerloss_mlp": 0.0, + "step": 645, + "time_per_iteration": 2.645270586013794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199535, + "balance_loss_mlp": 1.17913866, + "diversity_loss_mlp": 0.0, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.10325279424343262, + "language_loss": 0.88927197, + "learning_rate": 0.000976876687931362, + "loss": 0.90126729, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 646, + "time_per_iteration": 2.9558987617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154414, + "balance_loss_mlp": 1.13427997, + "diversity_loss_mlp": 0.0, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.10259074887379964, + "language_loss": 0.84658372, + "learning_rate": 0.0009767829487868005, + "loss": 0.85812783, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.20129395, + "routerloss_mlp": 0.0, + "step": 647, + "time_per_iteration": 2.593254566192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165839, + "balance_loss_mlp": 1.14557362, + "diversity_loss_mlp": 0.0, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.08660672395493044, + "language_loss": 0.88729513, + "learning_rate": 0.000976689024538682, + "loss": 0.8989535, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.20263672, + "routerloss_mlp": 0.0, + "step": 648, + "time_per_iteration": 2.6087043285369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147043, + "balance_loss_mlp": 1.12564492, + "diversity_loss_mlp": 0.0, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.09471610460140056, + "language_loss": 0.86980593, + "learning_rate": 0.0009765949152234716, + "loss": 0.88127637, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.21411133, + "routerloss_mlp": 0.0, + "step": 649, + "time_per_iteration": 2.8878984451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130912, + "balance_loss_mlp": 2.08723378, + "diversity_loss_mlp": 0.0, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.17488169385486374, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.80816996, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.4375, + "routerloss_mlp": 0.0, + "step": 650, + "time_per_iteration": 4.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125186, + "balance_loss_mlp": 1.10393071, + "diversity_loss_mlp": 0.0, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09783498118048492, + "language_loss": 0.81436628, + "learning_rate": 0.0009764061415379919, + "loss": 0.82561815, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.21276855, + "routerloss_mlp": 0.0, + "step": 651, + "time_per_iteration": 3.2849485874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135606, + "balance_loss_mlp": 1.11419618, + "diversity_loss_mlp": 0.0, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.08568090703098526, + "language_loss": 0.88376707, + "learning_rate": 0.0009763114772410109, + "loss": 0.89512312, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.21435547, + "routerloss_mlp": 0.0, + "step": 652, + "time_per_iteration": 2.640482187271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147244, + "balance_loss_mlp": 1.12633479, + "diversity_loss_mlp": 0.0, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.0799999486499222, + "language_loss": 0.86490756, + "learning_rate": 0.0009762166280235146, + "loss": 0.87638003, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.20910645, + "routerloss_mlp": 0.0, + "step": 653, + "time_per_iteration": 2.9535903930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188128, + "balance_loss_mlp": 1.16659844, + "diversity_loss_mlp": 0.0, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.09522027236447655, + "language_loss": 0.86765033, + "learning_rate": 0.0009761215939223267, + "loss": 0.87953162, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 654, + "time_per_iteration": 2.7124929428100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186032, + "balance_loss_mlp": 1.16533732, + "diversity_loss_mlp": 0.0, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.11212167432887624, + "language_loss": 0.85993934, + "learning_rate": 0.0009760263749743428, + "loss": 0.87179965, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.20690918, + "routerloss_mlp": 0.0, + "step": 655, + "time_per_iteration": 2.5919461250305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171572, + "balance_loss_mlp": 1.1518662, + "diversity_loss_mlp": 0.0, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.09226162692886594, + "language_loss": 0.89700639, + "learning_rate": 0.0009759309712165299, + "loss": 0.9087221, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.19702148, + "routerloss_mlp": 0.0, + "step": 656, + "time_per_iteration": 2.746537685394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161192, + "balance_loss_mlp": 1.14149833, + "diversity_loss_mlp": 0.0, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.08627335840647962, + "language_loss": 0.92326117, + "learning_rate": 0.0009758353826859272, + "loss": 0.9348731, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 657, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10790431, + "diversity_loss_mlp": 0.0, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.1059978443595565, + "language_loss": 0.88603538, + "learning_rate": 0.0009757396094196456, + "loss": 0.89732224, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.20788574, + "routerloss_mlp": 0.0, + "step": 658, + "time_per_iteration": 2.8773136138916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130444, + "balance_loss_mlp": 1.11040533, + "diversity_loss_mlp": 0.0, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.12293029558515219, + "language_loss": 0.83426332, + "learning_rate": 0.0009756436514548673, + "loss": 0.8455677, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.20031738, + "routerloss_mlp": 0.0, + "step": 659, + "time_per_iteration": 2.810722589492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134733, + "balance_loss_mlp": 1.11438441, + "diversity_loss_mlp": 0.0, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.06793027871708798, + "language_loss": 0.87658846, + "learning_rate": 0.0009755475088288466, + "loss": 0.88793576, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.20349121, + "routerloss_mlp": 0.0, + "step": 660, + "time_per_iteration": 2.7121376991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147973, + "balance_loss_mlp": 1.12785089, + "diversity_loss_mlp": 0.0, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.08710392398912287, + "language_loss": 0.89421189, + "learning_rate": 0.0009754511815789095, + "loss": 0.90569162, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.20117188, + "routerloss_mlp": 0.0, + "step": 661, + "time_per_iteration": 2.777318239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162586, + "balance_loss_mlp": 1.14171267, + "diversity_loss_mlp": 0.0, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.08537034247511402, + "language_loss": 0.84716892, + "learning_rate": 0.0009753546697424533, + "loss": 0.85879481, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 662, + "time_per_iteration": 2.6664726734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169368, + "balance_loss_mlp": 1.14935231, + "diversity_loss_mlp": 0.0, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.08593929583832248, + "language_loss": 0.89815515, + "learning_rate": 0.0009752579733569475, + "loss": 0.90984881, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.20019531, + "routerloss_mlp": 0.0, + "step": 663, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02192512, + "balance_loss_mlp": 2.16352034, + "diversity_loss_mlp": 0.0, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.2093028146020386, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.77073896, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.2890625, + "routerloss_mlp": 0.0, + "step": 664, + "time_per_iteration": 4.96467137336731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00927072, + "balance_loss_mlp": 1.59828615, + "diversity_loss_mlp": 0.21952696, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.040572636524321984, + "language_loss": 0.8949101, + "learning_rate": 0.0009750640270890217, + "loss": 0.90418077, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816532, + "step": 665, + "time_per_iteration": 2.7632246017456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241186, + "balance_loss_mlp": 1.22053885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.08846289988129392, + "language_loss": 0.95572138, + "learning_rate": 0.0009749667772818983, + "loss": 0.96813321, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.20654297, + "routerloss_mlp": 0.0, + "step": 666, + "time_per_iteration": 3.037458896636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183198, + "balance_loss_mlp": 1.80241597, + "diversity_loss_mlp": 0.0, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.11554481164154014, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.7876792, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.29492188, + "routerloss_mlp": 0.0, + "step": 667, + "time_per_iteration": 4.810182332992554 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244511, + "balance_loss_mlp": 1.22299325, + "diversity_loss_mlp": 0.0, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.09137997717488894, + "language_loss": 0.94816601, + "learning_rate": 0.0009747717245101093, + "loss": 0.9606111, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.21520996, + "routerloss_mlp": 0.0, + "step": 668, + "time_per_iteration": 2.552507162094116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917856, + "balance_loss_mlp": 1.58052325, + "diversity_loss_mlp": 0.21830653, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.03508480239171642, + "language_loss": 0.8457346, + "learning_rate": 0.00097467392162117, + "loss": 0.85491318, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01844162, + "step": 669, + "time_per_iteration": 2.6064391136169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242109, + "balance_loss_mlp": 1.21882796, + "diversity_loss_mlp": 0.0, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.1666980552990896, + "language_loss": 0.90609741, + "learning_rate": 0.0009745759344474708, + "loss": 0.91851848, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.23266602, + "routerloss_mlp": 0.0, + "step": 670, + "time_per_iteration": 2.826202392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229033, + "balance_loss_mlp": 1.2077179, + "diversity_loss_mlp": 0.0, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.09671049007121679, + "language_loss": 0.88974905, + "learning_rate": 0.0009744777630270536, + "loss": 0.90203935, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.21337891, + "routerloss_mlp": 0.0, + "step": 671, + "time_per_iteration": 2.578334331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233527, + "balance_loss_mlp": 1.21067417, + "diversity_loss_mlp": 0.0, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.08999527722625096, + "language_loss": 0.92790663, + "learning_rate": 0.000974379407398032, + "loss": 0.94024187, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.22839355, + "routerloss_mlp": 0.0, + "step": 672, + "time_per_iteration": 2.8661158084869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237675, + "balance_loss_mlp": 1.21589506, + "diversity_loss_mlp": 0.0, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09653126460783178, + "language_loss": 0.81875724, + "learning_rate": 0.0009742808675985913, + "loss": 0.83113402, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21801758, + "routerloss_mlp": 0.0, + "step": 673, + "time_per_iteration": 3.0861356258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260533, + "balance_loss_mlp": 1.23754919, + "diversity_loss_mlp": 0.0, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08653130412501808, + "language_loss": 0.90219223, + "learning_rate": 0.0009741821436669876, + "loss": 0.91479754, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.2298584, + "routerloss_mlp": 0.0, + "step": 674, + "time_per_iteration": 2.5609960556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267597, + "balance_loss_mlp": 1.24489975, + "diversity_loss_mlp": 0.0, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.09623752325881015, + "language_loss": 0.91791725, + "learning_rate": 0.0009740832356415492, + "loss": 0.93059325, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22680664, + "routerloss_mlp": 0.0, + "step": 675, + "time_per_iteration": 2.544027805328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295128, + "balance_loss_mlp": 1.27278781, + "diversity_loss_mlp": 0.0, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.08903369590662558, + "language_loss": 0.87403589, + "learning_rate": 0.0009739841435606756, + "loss": 0.88698715, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22338867, + "routerloss_mlp": 0.0, + "step": 676, + "time_per_iteration": 2.9931325912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261461, + "balance_loss_mlp": 1.23933589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.0602287995404217, + "language_loss": 0.89557111, + "learning_rate": 0.0009738848674628377, + "loss": 0.90818572, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.22131348, + "routerloss_mlp": 0.0, + "step": 677, + "time_per_iteration": 2.7290966510772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264602, + "balance_loss_mlp": 1.24307275, + "diversity_loss_mlp": 0.0, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.10468610894957399, + "language_loss": 0.88751101, + "learning_rate": 0.000973785407386578, + "loss": 0.90015703, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.2154541, + "routerloss_mlp": 0.0, + "step": 678, + "time_per_iteration": 2.7950329780578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00969584, + "balance_loss_mlp": 1.6979661, + "diversity_loss_mlp": 0.20886885, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.03344489204860934, + "language_loss": 0.86933386, + "learning_rate": 0.0009736857633705103, + "loss": 0.87902969, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01616703, + "step": 679, + "time_per_iteration": 2.8691866397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193718, + "balance_loss_mlp": 1.17283261, + "diversity_loss_mlp": 0.0, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.08130386374469858, + "language_loss": 0.92363989, + "learning_rate": 0.0009735859354533196, + "loss": 0.93557703, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.2088623, + "routerloss_mlp": 0.0, + "step": 680, + "time_per_iteration": 2.6832337379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155917, + "balance_loss_mlp": 1.13447094, + "diversity_loss_mlp": 0.0, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.0924188238597787, + "language_loss": 0.91083395, + "learning_rate": 0.0009734859236737628, + "loss": 0.92239314, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.21459961, + "routerloss_mlp": 0.0, + "step": 681, + "time_per_iteration": 2.6023473739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125397, + "balance_loss_mlp": 1.10410571, + "diversity_loss_mlp": 0.0, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.08442474228180671, + "language_loss": 0.93186569, + "learning_rate": 0.0009733857280706678, + "loss": 0.9431197, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.2130127, + "routerloss_mlp": 0.0, + "step": 682, + "time_per_iteration": 2.5775911808013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968386, + "balance_loss_mlp": 1.69064701, + "diversity_loss_mlp": 0.21057674, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.03992508312329801, + "language_loss": 0.84369749, + "learning_rate": 0.000973285348682934, + "loss": 0.85338134, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777408, + "step": 683, + "time_per_iteration": 2.768641233444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01618305, + "balance_loss_mlp": 1.58530831, + "diversity_loss_mlp": 0.0, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.09794042911652269, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79516685, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.33007812, + "routerloss_mlp": 0.0, + "step": 684, + "time_per_iteration": 4.802167177200317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094162, + "balance_loss_mlp": 1.07383704, + "diversity_loss_mlp": 0.0, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.12652995306024198, + "language_loss": 0.84832728, + "learning_rate": 0.0009730840387095046, + "loss": 0.8592689, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.20324707, + "routerloss_mlp": 0.0, + "step": 685, + "time_per_iteration": 3.2910287380218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112414, + "balance_loss_mlp": 1.09188628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.13012317463795417, + "language_loss": 0.90537834, + "learning_rate": 0.0009729831082019642, + "loss": 0.91650254, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.20532227, + "routerloss_mlp": 0.0, + "step": 686, + "time_per_iteration": 2.7909138202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121716, + "balance_loss_mlp": 1.101331, + "diversity_loss_mlp": 0.0, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.08096428549902779, + "language_loss": 0.88353586, + "learning_rate": 0.0009728819940660958, + "loss": 0.89475298, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.20385742, + "routerloss_mlp": 0.0, + "step": 687, + "time_per_iteration": 2.7699429988861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131653, + "balance_loss_mlp": 1.11135173, + "diversity_loss_mlp": 0.0, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.07933225152322496, + "language_loss": 0.85085285, + "learning_rate": 0.0009727806963411557, + "loss": 0.86216938, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 688, + "time_per_iteration": 2.581984519958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144043, + "balance_loss_mlp": 1.12350333, + "diversity_loss_mlp": 0.0, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.09807362554425139, + "language_loss": 0.87180853, + "learning_rate": 0.000972679215066471, + "loss": 0.88324893, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.20544434, + "routerloss_mlp": 0.0, + "step": 689, + "time_per_iteration": 2.6538989543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148083, + "balance_loss_mlp": 1.12809181, + "diversity_loss_mlp": 0.0, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.09247782934143206, + "language_loss": 0.98983967, + "learning_rate": 0.0009725775502814401, + "loss": 1.00132048, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.19995117, + "routerloss_mlp": 0.0, + "step": 690, + "time_per_iteration": 2.610485315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167941, + "balance_loss_mlp": 1.14827132, + "diversity_loss_mlp": 0.0, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.08082631328369684, + "language_loss": 0.84880829, + "learning_rate": 0.0009724757020255327, + "loss": 0.8604877, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.1965332, + "routerloss_mlp": 0.0, + "step": 691, + "time_per_iteration": 2.8424370288848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152001, + "balance_loss_mlp": 1.13209307, + "diversity_loss_mlp": 0.0, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.09067820147092803, + "language_loss": 0.87807095, + "learning_rate": 0.0009723736703382902, + "loss": 0.88959098, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.19897461, + "routerloss_mlp": 0.0, + "step": 692, + "time_per_iteration": 2.5578606128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149266, + "balance_loss_mlp": 1.13037133, + "diversity_loss_mlp": 0.0, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.07979062216362842, + "language_loss": 0.82877922, + "learning_rate": 0.0009722714552593244, + "loss": 0.84027195, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 693, + "time_per_iteration": 2.6148533821105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153464, + "balance_loss_mlp": 1.13444984, + "diversity_loss_mlp": 0.0, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.08708336283232748, + "language_loss": 0.94164526, + "learning_rate": 0.000972169056828319, + "loss": 0.9531799, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.18994141, + "routerloss_mlp": 0.0, + "step": 694, + "time_per_iteration": 2.517944097518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154915, + "balance_loss_mlp": 1.1360321, + "diversity_loss_mlp": 0.0, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.0753733884935208, + "language_loss": 0.86921358, + "learning_rate": 0.0009720664750850283, + "loss": 0.8807627, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 695, + "time_per_iteration": 2.8149421215057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148667, + "balance_loss_mlp": 1.1299628, + "diversity_loss_mlp": 0.0, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.09445278911045346, + "language_loss": 0.92951906, + "learning_rate": 0.0009719637100692784, + "loss": 0.94100577, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 696, + "time_per_iteration": 2.719451904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149322, + "balance_loss_mlp": 1.13098741, + "diversity_loss_mlp": 0.0, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.10008701466446891, + "language_loss": 0.82604736, + "learning_rate": 0.0009718607618209661, + "loss": 0.83754057, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 697, + "time_per_iteration": 2.8692104816436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148038, + "balance_loss_mlp": 1.12914348, + "diversity_loss_mlp": 0.0, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07908911060166324, + "language_loss": 0.87701273, + "learning_rate": 0.0009717576303800595, + "loss": 0.88849318, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.1887207, + "routerloss_mlp": 0.0, + "step": 698, + "time_per_iteration": 3.0484437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139219, + "balance_loss_mlp": 1.11988366, + "diversity_loss_mlp": 0.0, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.12480577454910273, + "language_loss": 0.85819161, + "learning_rate": 0.0009716543157865975, + "loss": 0.86958385, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.1932373, + "routerloss_mlp": 0.0, + "step": 699, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144768, + "balance_loss_mlp": 1.12586117, + "diversity_loss_mlp": 0.0, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.16362357873421526, + "language_loss": 0.83352965, + "learning_rate": 0.0009715508180806907, + "loss": 0.84497738, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.18896484, + "routerloss_mlp": 0.0, + "step": 700, + "time_per_iteration": 3.1985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162079, + "balance_loss_mlp": 1.14230227, + "diversity_loss_mlp": 0.0, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.08746408781150025, + "language_loss": 0.90170425, + "learning_rate": 0.0009714471373025202, + "loss": 0.91332507, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.19763184, + "routerloss_mlp": 0.0, + "step": 701, + "time_per_iteration": 3.487022638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.13656974, + "diversity_loss_mlp": 0.0, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.10787745491017559, + "language_loss": 0.88186693, + "learning_rate": 0.0009713432734923386, + "loss": 0.89343208, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.19934082, + "routerloss_mlp": 0.0, + "step": 702, + "time_per_iteration": 2.6239736080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167828, + "balance_loss_mlp": 1.14830136, + "diversity_loss_mlp": 0.0, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09670789671988574, + "language_loss": 0.86879516, + "learning_rate": 0.0009712392266904696, + "loss": 0.88047349, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.19506836, + "routerloss_mlp": 0.0, + "step": 703, + "time_per_iteration": 2.7542335987091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181198, + "balance_loss_mlp": 1.16149247, + "diversity_loss_mlp": 0.0, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.10598212751912446, + "language_loss": 0.85246772, + "learning_rate": 0.0009711349969373076, + "loss": 0.86427975, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 704, + "time_per_iteration": 3.162461042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175522, + "balance_loss_mlp": 1.15518451, + "diversity_loss_mlp": 0.0, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.0954290464489283, + "language_loss": 0.80285007, + "learning_rate": 0.0009710305842733178, + "loss": 0.81460524, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.20336914, + "routerloss_mlp": 0.0, + "step": 705, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155052, + "balance_loss_mlp": 1.13601446, + "diversity_loss_mlp": 0.0, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.09437017973872532, + "language_loss": 0.89630616, + "learning_rate": 0.0009709259887390373, + "loss": 0.9078567, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.19030762, + "routerloss_mlp": 0.0, + "step": 706, + "time_per_iteration": 2.6160268783569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895019, + "balance_loss_mlp": 1.55161047, + "diversity_loss_mlp": 0.20666173, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.04273378361131697, + "language_loss": 0.90874577, + "learning_rate": 0.0009708212103750737, + "loss": 0.91769588, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588319, + "step": 707, + "time_per_iteration": 2.594606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.16110778, + "diversity_loss_mlp": 0.0, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.08814378894040824, + "language_loss": 0.87522972, + "learning_rate": 0.0009707162492221051, + "loss": 0.88703418, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.19335938, + "routerloss_mlp": 0.0, + "step": 708, + "time_per_iteration": 2.8884427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197388, + "balance_loss_mlp": 1.17801642, + "diversity_loss_mlp": 0.0, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.07892254834086627, + "language_loss": 0.87611169, + "learning_rate": 0.0009706111053208815, + "loss": 0.8880856, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.19348145, + "routerloss_mlp": 0.0, + "step": 709, + "time_per_iteration": 2.7824413776397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213311, + "balance_loss_mlp": 1.19383228, + "diversity_loss_mlp": 0.0, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10389736734512126, + "language_loss": 0.85504246, + "learning_rate": 0.0009705057787122232, + "loss": 0.86717558, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.19458008, + "routerloss_mlp": 0.0, + "step": 710, + "time_per_iteration": 2.529498815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178108, + "balance_loss_mlp": 1.15870059, + "diversity_loss_mlp": 0.0, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.07975606670492637, + "language_loss": 0.91293353, + "learning_rate": 0.0009704002694370216, + "loss": 0.92471457, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.19384766, + "routerloss_mlp": 0.0, + "step": 711, + "time_per_iteration": 2.5365610122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152018, + "balance_loss_mlp": 1.13282573, + "diversity_loss_mlp": 0.0, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.08453852441771745, + "language_loss": 0.86583841, + "learning_rate": 0.0009702945775362388, + "loss": 0.87735862, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.19177246, + "routerloss_mlp": 0.0, + "step": 712, + "time_per_iteration": 2.595674514770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111883, + "balance_loss_mlp": 1.10022175, + "diversity_loss_mlp": 0.0, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.08096963371537849, + "language_loss": 0.87088716, + "learning_rate": 0.0009701887030509086, + "loss": 0.88207549, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.18615723, + "routerloss_mlp": 0.0, + "step": 713, + "time_per_iteration": 2.6124320030212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112657, + "balance_loss_mlp": 1.09444165, + "diversity_loss_mlp": 0.0, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.12434454369652892, + "language_loss": 0.91262931, + "learning_rate": 0.0009700826460221346, + "loss": 0.92375588, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.18225098, + "routerloss_mlp": 0.0, + "step": 714, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115832, + "balance_loss_mlp": 1.09812903, + "diversity_loss_mlp": 0.0, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.11407804289300516, + "language_loss": 0.92571628, + "learning_rate": 0.0009699764064910921, + "loss": 0.93687463, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.17712402, + "routerloss_mlp": 0.0, + "step": 715, + "time_per_iteration": 2.8810853958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121116, + "balance_loss_mlp": 1.10322237, + "diversity_loss_mlp": 0.0, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.08940816195623212, + "language_loss": 0.86826718, + "learning_rate": 0.0009698699844990268, + "loss": 0.87947834, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.17907715, + "routerloss_mlp": 0.0, + "step": 716, + "time_per_iteration": 2.697970151901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153213, + "balance_loss_mlp": 1.13561809, + "diversity_loss_mlp": 0.0, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.07906779204708066, + "language_loss": 0.88138282, + "learning_rate": 0.0009697633800872555, + "loss": 0.89291501, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 717, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197417, + "balance_loss_mlp": 1.1801312, + "diversity_loss_mlp": 0.0, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.10867682790127652, + "language_loss": 0.9066782, + "learning_rate": 0.0009696565932971655, + "loss": 0.91865242, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 718, + "time_per_iteration": 2.8944718837738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209582, + "balance_loss_mlp": 1.19165277, + "diversity_loss_mlp": 0.0, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.0949883595308799, + "language_loss": 0.89814746, + "learning_rate": 0.0009695496241702153, + "loss": 0.91024327, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 719, + "time_per_iteration": 2.7888894081115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188478, + "balance_loss_mlp": 1.17082274, + "diversity_loss_mlp": 0.0, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.11627833553714081, + "language_loss": 0.86245799, + "learning_rate": 0.0009694424727479339, + "loss": 0.87434286, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.17687988, + "routerloss_mlp": 0.0, + "step": 720, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157865, + "balance_loss_mlp": 1.14056826, + "diversity_loss_mlp": 0.0, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.09369792564045784, + "language_loss": 0.88928097, + "learning_rate": 0.0009693351390719213, + "loss": 0.90085959, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 721, + "time_per_iteration": 2.6945152282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126431, + "balance_loss_mlp": 1.10868096, + "diversity_loss_mlp": 0.0, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.07998653864580182, + "language_loss": 0.90800881, + "learning_rate": 0.000969227623183848, + "loss": 0.91927308, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 722, + "time_per_iteration": 2.789515733718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110503, + "balance_loss_mlp": 1.0873754, + "diversity_loss_mlp": 0.0, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.07914116119322331, + "language_loss": 0.90912664, + "learning_rate": 0.0009691199251254554, + "loss": 0.92017698, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.17663574, + "routerloss_mlp": 0.0, + "step": 723, + "time_per_iteration": 2.8231685161590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0093359, + "balance_loss_mlp": 1.62175167, + "diversity_loss_mlp": 0.20987722, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.03669424434563534, + "language_loss": 0.86868215, + "learning_rate": 0.0009690120449385555, + "loss": 0.87801802, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01777578, + "step": 724, + "time_per_iteration": 2.8498518466949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093753, + "balance_loss_mlp": 1.07543111, + "diversity_loss_mlp": 0.0, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.10366482624390064, + "language_loss": 0.92449063, + "learning_rate": 0.0009689039826650312, + "loss": 0.93542814, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.18322754, + "routerloss_mlp": 0.0, + "step": 725, + "time_per_iteration": 2.7611966133117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0154366, + "balance_loss_mlp": 1.50932813, + "diversity_loss_mlp": 0.0, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.08078369374569346, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.78066719, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.34375, + "routerloss_mlp": 0.0, + "step": 726, + "time_per_iteration": 4.927435398101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933775, + "balance_loss_mlp": 1.62253523, + "diversity_loss_mlp": 0.20735951, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.04309218151041253, + "language_loss": 0.87429261, + "learning_rate": 0.0009686873120259941, + "loss": 0.88363039, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01882811, + "step": 727, + "time_per_iteration": 2.602264165878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113926, + "balance_loss_mlp": 1.12035322, + "diversity_loss_mlp": 0.0, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.14876828859354083, + "language_loss": 0.8713131, + "learning_rate": 0.0009685787037446004, + "loss": 0.88270569, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.18884277, + "routerloss_mlp": 0.0, + "step": 728, + "time_per_iteration": 2.806549072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118218, + "balance_loss_mlp": 1.09903765, + "diversity_loss_mlp": 0.0, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.1987640778264907, + "language_loss": 0.87505388, + "learning_rate": 0.0009684699135448201, + "loss": 0.88623607, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 729, + "time_per_iteration": 2.7200138568878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112792, + "balance_loss_mlp": 1.09435034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.0640895655048784, + "language_loss": 0.92135447, + "learning_rate": 0.0009683609414688895, + "loss": 0.93248242, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.18432617, + "routerloss_mlp": 0.0, + "step": 730, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911127, + "balance_loss_mlp": 1.58117688, + "diversity_loss_mlp": 0.20959289, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.03249579551243702, + "language_loss": 0.86587501, + "learning_rate": 0.0009682517875591154, + "loss": 0.87498629, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01574249, + "step": 731, + "time_per_iteration": 2.809400796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199938, + "balance_loss_mlp": 1.18138909, + "diversity_loss_mlp": 0.0, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.07609394509363156, + "language_loss": 0.86229968, + "learning_rate": 0.0009681424518578749, + "loss": 0.87429905, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.18530273, + "routerloss_mlp": 0.0, + "step": 732, + "time_per_iteration": 2.725839614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283686, + "balance_loss_mlp": 1.26505399, + "diversity_loss_mlp": 0.0, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.1414658743658329, + "language_loss": 0.87506676, + "learning_rate": 0.000968032934407616, + "loss": 0.88790363, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.1862793, + "routerloss_mlp": 0.0, + "step": 733, + "time_per_iteration": 2.583768844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01310281, + "balance_loss_mlp": 1.29136264, + "diversity_loss_mlp": 0.0, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.10963887531318486, + "language_loss": 0.81871867, + "learning_rate": 0.0009679232352508571, + "loss": 0.8318215, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.18908691, + "routerloss_mlp": 0.0, + "step": 734, + "time_per_iteration": 2.785585880279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286635, + "balance_loss_mlp": 1.26744211, + "diversity_loss_mlp": 0.0, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.10469043869015734, + "language_loss": 0.80695581, + "learning_rate": 0.0009678133544301871, + "loss": 0.81982213, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.19165039, + "routerloss_mlp": 0.0, + "step": 735, + "time_per_iteration": 2.6638481616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224375, + "balance_loss_mlp": 1.20588589, + "diversity_loss_mlp": 0.0, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.06500438819618859, + "language_loss": 0.91870093, + "learning_rate": 0.0009677032919882658, + "loss": 0.93094468, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.18493652, + "routerloss_mlp": 0.0, + "step": 736, + "time_per_iteration": 2.6578378677368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197974, + "balance_loss_mlp": 1.18056929, + "diversity_loss_mlp": 0.0, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.09940630997209131, + "language_loss": 0.91374373, + "learning_rate": 0.000967593047967823, + "loss": 0.92572349, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.17419434, + "routerloss_mlp": 0.0, + "step": 737, + "time_per_iteration": 2.5236403942108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117212, + "balance_loss_mlp": 1.15476346, + "diversity_loss_mlp": 0.0, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.10840920786543624, + "language_loss": 0.86479127, + "learning_rate": 0.0009674826224116593, + "loss": 0.87651253, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 738, + "time_per_iteration": 2.803260326385498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134605, + "balance_loss_mlp": 1.11759412, + "diversity_loss_mlp": 0.0, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.09051392518082112, + "language_loss": 0.86862409, + "learning_rate": 0.0009673720153626455, + "loss": 0.87997013, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 739, + "time_per_iteration": 2.6086573600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124987, + "balance_loss_mlp": 1.10798764, + "diversity_loss_mlp": 0.0, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.11444093339414264, + "language_loss": 0.8689152, + "learning_rate": 0.0009672612268637235, + "loss": 0.88016504, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.17016602, + "routerloss_mlp": 0.0, + "step": 740, + "time_per_iteration": 2.582648277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116151, + "balance_loss_mlp": 1.09880614, + "diversity_loss_mlp": 0.0, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.10874190594389947, + "language_loss": 0.84213787, + "learning_rate": 0.0009671502569579048, + "loss": 0.85329938, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 741, + "time_per_iteration": 2.7945284843444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132432, + "balance_loss_mlp": 1.11539662, + "diversity_loss_mlp": 0.0, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.07140691777849974, + "language_loss": 0.89503837, + "learning_rate": 0.0009670391056882719, + "loss": 0.90636265, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.17053223, + "routerloss_mlp": 0.0, + "step": 742, + "time_per_iteration": 2.71687912940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149228, + "balance_loss_mlp": 1.13240731, + "diversity_loss_mlp": 0.0, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.08672376963732596, + "language_loss": 0.88698781, + "learning_rate": 0.0009669277730979776, + "loss": 0.89848006, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 743, + "time_per_iteration": 3.2029030323028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147207, + "balance_loss_mlp": 1.13025546, + "diversity_loss_mlp": 0.0, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.09113342882689801, + "language_loss": 0.85227454, + "learning_rate": 0.0009668162592302449, + "loss": 0.86374664, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 744, + "time_per_iteration": 2.899656057357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165032, + "balance_loss_mlp": 1.14748406, + "diversity_loss_mlp": 0.0, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.07780467137911447, + "language_loss": 0.86560214, + "learning_rate": 0.0009667045641283676, + "loss": 0.87725246, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.17553711, + "routerloss_mlp": 0.0, + "step": 745, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159177, + "balance_loss_mlp": 1.14148676, + "diversity_loss_mlp": 0.0, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09864944110558675, + "language_loss": 0.95312673, + "learning_rate": 0.0009665926878357092, + "loss": 0.96471858, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.17700195, + "routerloss_mlp": 0.0, + "step": 746, + "time_per_iteration": 2.946307420730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00851982, + "balance_loss_mlp": 1.46230698, + "diversity_loss_mlp": 0.20995456, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.034792990408202794, + "language_loss": 0.91192698, + "learning_rate": 0.0009664806303957043, + "loss": 0.92044681, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01585159, + "step": 747, + "time_per_iteration": 2.706286668777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160661, + "balance_loss_mlp": 1.14221931, + "diversity_loss_mlp": 0.0, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.08367194984434445, + "language_loss": 0.87066692, + "learning_rate": 0.0009663683918518571, + "loss": 0.88227355, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 748, + "time_per_iteration": 2.892982244491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136526, + "balance_loss_mlp": 1.11831081, + "diversity_loss_mlp": 0.0, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.07455761265115375, + "language_loss": 0.85490787, + "learning_rate": 0.0009662559722477428, + "loss": 0.86627316, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.18237305, + "routerloss_mlp": 0.0, + "step": 749, + "time_per_iteration": 2.6979615688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292346, + "balance_loss_mlp": 1.2582047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.08640394257539531, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77455318, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.34179688, + "routerloss_mlp": 0.0, + "step": 750, + "time_per_iteration": 4.991304397583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128515, + "balance_loss_mlp": 1.11068118, + "diversity_loss_mlp": 0.0, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.07866539193327844, + "language_loss": 0.89197791, + "learning_rate": 0.0009660305900333632, + "loss": 0.90326303, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.17834473, + "routerloss_mlp": 0.0, + "step": 751, + "time_per_iteration": 2.6706793308258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121937, + "balance_loss_mlp": 1.1038413, + "diversity_loss_mlp": 0.0, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.10038132697844201, + "language_loss": 0.82478833, + "learning_rate": 0.0009659176275105992, + "loss": 0.83600777, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.1809082, + "routerloss_mlp": 0.0, + "step": 752, + "time_per_iteration": 2.697909355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126212, + "balance_loss_mlp": 1.10777032, + "diversity_loss_mlp": 0.0, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.10638604925915984, + "language_loss": 0.85756153, + "learning_rate": 0.0009658044841025701, + "loss": 0.86882365, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.18444824, + "routerloss_mlp": 0.0, + "step": 753, + "time_per_iteration": 2.7749171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128331, + "balance_loss_mlp": 1.1107595, + "diversity_loss_mlp": 0.0, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.09130861127340602, + "language_loss": 0.81584072, + "learning_rate": 0.0009656911598532021, + "loss": 0.827124, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.17590332, + "routerloss_mlp": 0.0, + "step": 754, + "time_per_iteration": 2.635702610015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136592, + "balance_loss_mlp": 1.11914003, + "diversity_loss_mlp": 0.0, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.06835454276473461, + "language_loss": 0.90494555, + "learning_rate": 0.0009655776548064917, + "loss": 0.9163115, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.17456055, + "routerloss_mlp": 0.0, + "step": 755, + "time_per_iteration": 2.6545748710632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135969, + "balance_loss_mlp": 1.11902952, + "diversity_loss_mlp": 0.0, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.07886906074703284, + "language_loss": 0.88367254, + "learning_rate": 0.0009654639690065054, + "loss": 0.89503217, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 756, + "time_per_iteration": 2.8773815631866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150961, + "balance_loss_mlp": 1.13343716, + "diversity_loss_mlp": 0.0, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.07604063018618923, + "language_loss": 0.8823185, + "learning_rate": 0.00096535010249738, + "loss": 0.89382815, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.17529297, + "routerloss_mlp": 0.0, + "step": 757, + "time_per_iteration": 2.7175021171569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846707, + "balance_loss_mlp": 1.45519352, + "diversity_loss_mlp": 0.20419648, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.03954501513556402, + "language_loss": 0.82782531, + "learning_rate": 0.0009652360553233224, + "loss": 0.83629239, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017012, + "step": 758, + "time_per_iteration": 2.7434637546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115333, + "balance_loss_mlp": 1.12624609, + "diversity_loss_mlp": 0.0, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.03342191973393777, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7492708, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.27148438, + "routerloss_mlp": 0.0, + "step": 759, + "time_per_iteration": 4.910880088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188786, + "balance_loss_mlp": 1.17063034, + "diversity_loss_mlp": 0.0, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.0638252555407819, + "language_loss": 0.81659228, + "learning_rate": 0.0009650074191575883, + "loss": 0.82848012, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.18151855, + "routerloss_mlp": 0.0, + "step": 760, + "time_per_iteration": 3.2028603553771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213565, + "balance_loss_mlp": 1.19484925, + "diversity_loss_mlp": 0.0, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07046318146001718, + "language_loss": 0.86031073, + "learning_rate": 0.0009648928302546766, + "loss": 0.87244636, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.18713379, + "routerloss_mlp": 0.0, + "step": 761, + "time_per_iteration": 2.6812515258789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243947, + "balance_loss_mlp": 1.22551703, + "diversity_loss_mlp": 0.0, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.0884537515073792, + "language_loss": 0.85470825, + "learning_rate": 0.0009647780608643613, + "loss": 0.86714768, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.1842041, + "routerloss_mlp": 0.0, + "step": 762, + "time_per_iteration": 3.3486785888671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012302, + "balance_loss_mlp": 1.21243811, + "diversity_loss_mlp": 0.0, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.12042495658723557, + "language_loss": 0.874053, + "learning_rate": 0.0009646631110312001, + "loss": 0.88635492, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 763, + "time_per_iteration": 2.6648313999176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172377, + "balance_loss_mlp": 1.1544956, + "diversity_loss_mlp": 0.0, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.05916332097574664, + "language_loss": 0.8841719, + "learning_rate": 0.0009645479807998203, + "loss": 0.89589572, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 764, + "time_per_iteration": 2.7347912788391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147505, + "balance_loss_mlp": 1.12983775, + "diversity_loss_mlp": 0.0, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.06985321722585584, + "language_loss": 0.92467874, + "learning_rate": 0.0009644326702149196, + "loss": 0.93615377, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.17675781, + "routerloss_mlp": 0.0, + "step": 765, + "time_per_iteration": 2.7316319942474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135962, + "balance_loss_mlp": 1.11803293, + "diversity_loss_mlp": 0.0, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.09157028460957184, + "language_loss": 0.84919345, + "learning_rate": 0.0009643171793212653, + "loss": 0.86055309, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.17944336, + "routerloss_mlp": 0.0, + "step": 766, + "time_per_iteration": 3.116917610168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105537, + "balance_loss_mlp": 1.08738184, + "diversity_loss_mlp": 0.0, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.08034801396880724, + "language_loss": 0.89233959, + "learning_rate": 0.0009642015081636952, + "loss": 0.90339494, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.18164062, + "routerloss_mlp": 0.0, + "step": 767, + "time_per_iteration": 2.705993175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103513, + "balance_loss_mlp": 1.08563185, + "diversity_loss_mlp": 0.0, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.09221888586765616, + "language_loss": 0.88360566, + "learning_rate": 0.0009640856567871166, + "loss": 0.8946408, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.17895508, + "routerloss_mlp": 0.0, + "step": 768, + "time_per_iteration": 2.5172243118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108598, + "balance_loss_mlp": 1.08981061, + "diversity_loss_mlp": 0.0, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.0844592716079577, + "language_loss": 0.89047211, + "learning_rate": 0.0009639696252365072, + "loss": 0.9015581, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.18786621, + "routerloss_mlp": 0.0, + "step": 769, + "time_per_iteration": 3.034848690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105095, + "balance_loss_mlp": 1.08673656, + "diversity_loss_mlp": 0.0, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.07095543604969227, + "language_loss": 0.81996548, + "learning_rate": 0.0009638534135569144, + "loss": 0.83101642, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.18371582, + "routerloss_mlp": 0.0, + "step": 770, + "time_per_iteration": 2.947564125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106726, + "balance_loss_mlp": 1.08859468, + "diversity_loss_mlp": 0.0, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.08627707323979403, + "language_loss": 0.9012745, + "learning_rate": 0.0009637370217934554, + "loss": 0.91234171, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.18139648, + "routerloss_mlp": 0.0, + "step": 771, + "time_per_iteration": 2.6592423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.09355128, + "diversity_loss_mlp": 0.0, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06345294765682771, + "language_loss": 0.82981932, + "learning_rate": 0.0009636204499913175, + "loss": 0.84093815, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 772, + "time_per_iteration": 2.8836610317230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115362, + "balance_loss_mlp": 1.09749293, + "diversity_loss_mlp": 0.0, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.06338786563117527, + "language_loss": 0.87914705, + "learning_rate": 0.0009635036981957581, + "loss": 0.89030063, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.17883301, + "routerloss_mlp": 0.0, + "step": 773, + "time_per_iteration": 2.885239601135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132405, + "balance_loss_mlp": 1.11417794, + "diversity_loss_mlp": 0.0, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.08623405645423676, + "language_loss": 0.90735364, + "learning_rate": 0.0009633867664521043, + "loss": 0.91867769, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.18212891, + "routerloss_mlp": 0.0, + "step": 774, + "time_per_iteration": 2.802264451980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159356, + "balance_loss_mlp": 1.14176083, + "diversity_loss_mlp": 0.0, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.09977443827883303, + "language_loss": 0.86760318, + "learning_rate": 0.0009632696548057527, + "loss": 0.8791967, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.17614746, + "routerloss_mlp": 0.0, + "step": 775, + "time_per_iteration": 2.5641794204711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187156, + "balance_loss_mlp": 1.16960835, + "diversity_loss_mlp": 0.0, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.08744626586779954, + "language_loss": 0.85013115, + "learning_rate": 0.0009631523633021704, + "loss": 0.86200273, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 776, + "time_per_iteration": 2.7851786613464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881631, + "balance_loss_mlp": 1.52411294, + "diversity_loss_mlp": 0.20632464, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.038364140445948956, + "language_loss": 0.88378215, + "learning_rate": 0.0009630348919868936, + "loss": 0.89259851, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164127, + "step": 777, + "time_per_iteration": 2.7285845279693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191902, + "balance_loss_mlp": 1.17415154, + "diversity_loss_mlp": 0.0, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.14061909589017782, + "language_loss": 0.81450796, + "learning_rate": 0.0009629172409055293, + "loss": 0.82642698, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.1776123, + "routerloss_mlp": 0.0, + "step": 778, + "time_per_iteration": 2.5018203258514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154162, + "balance_loss_mlp": 1.13728166, + "diversity_loss_mlp": 0.0, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.06968828956123203, + "language_loss": 0.87518388, + "learning_rate": 0.0009627994101037531, + "loss": 0.88672549, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 779, + "time_per_iteration": 2.763136863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139257, + "balance_loss_mlp": 1.12231779, + "diversity_loss_mlp": 0.0, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.07833298109740298, + "language_loss": 0.88761836, + "learning_rate": 0.0009626813996273114, + "loss": 0.8990109, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 780, + "time_per_iteration": 2.8791675567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117275, + "balance_loss_mlp": 1.09990597, + "diversity_loss_mlp": 0.0, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09603506751758703, + "language_loss": 0.89051467, + "learning_rate": 0.0009625632095220198, + "loss": 0.90168738, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.17370605, + "routerloss_mlp": 0.0, + "step": 781, + "time_per_iteration": 2.8194801807403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119786, + "balance_loss_mlp": 1.10251248, + "diversity_loss_mlp": 0.0, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1003760880169841, + "language_loss": 0.86904705, + "learning_rate": 0.0009624448398337637, + "loss": 0.88024497, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 782, + "time_per_iteration": 2.511925458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117445, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08409428795596587, + "language_loss": 0.8913728, + "learning_rate": 0.0009623262906084984, + "loss": 0.90254724, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.17236328, + "routerloss_mlp": 0.0, + "step": 783, + "time_per_iteration": 2.9890754222869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125496, + "balance_loss_mlp": 1.10804367, + "diversity_loss_mlp": 0.0, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.07818041002140835, + "language_loss": 0.90351313, + "learning_rate": 0.0009622075618922486, + "loss": 0.9147681, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 784, + "time_per_iteration": 2.6550891399383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119712, + "balance_loss_mlp": 1.10261774, + "diversity_loss_mlp": 0.0, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.07239943737193227, + "language_loss": 0.87125635, + "learning_rate": 0.0009620886537311091, + "loss": 0.88245344, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.17114258, + "routerloss_mlp": 0.0, + "step": 785, + "time_per_iteration": 2.646864652633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_mlp": 1.10794032, + "diversity_loss_mlp": 0.0, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.08980079735835493, + "language_loss": 0.85309643, + "learning_rate": 0.000961969566171244, + "loss": 0.86435217, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.1763916, + "routerloss_mlp": 0.0, + "step": 786, + "time_per_iteration": 2.5803041458129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136873, + "balance_loss_mlp": 1.11938524, + "diversity_loss_mlp": 0.0, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08282756535064502, + "language_loss": 0.8993417, + "learning_rate": 0.0009618502992588873, + "loss": 0.91071045, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.17504883, + "routerloss_mlp": 0.0, + "step": 787, + "time_per_iteration": 2.6479151248931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124837, + "balance_loss_mlp": 1.10727715, + "diversity_loss_mlp": 0.0, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.07571751270322945, + "language_loss": 0.8792628, + "learning_rate": 0.0009617308530403424, + "loss": 0.89051116, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.17565918, + "routerloss_mlp": 0.0, + "step": 788, + "time_per_iteration": 3.002804756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125535, + "balance_loss_mlp": 1.10758173, + "diversity_loss_mlp": 0.0, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0842913885359751, + "language_loss": 0.88032806, + "learning_rate": 0.0009616112275619825, + "loss": 0.89158338, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 789, + "time_per_iteration": 2.6842775344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110837, + "balance_loss_mlp": 1.09398067, + "diversity_loss_mlp": 0.0, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.07451962795351484, + "language_loss": 0.83893597, + "learning_rate": 0.0009614914228702503, + "loss": 0.85004437, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 790, + "time_per_iteration": 2.714026689529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095726, + "balance_loss_mlp": 1.07848811, + "diversity_loss_mlp": 0.0, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.07099161447381937, + "language_loss": 0.89133644, + "learning_rate": 0.0009613714390116581, + "loss": 0.90229368, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.17260742, + "routerloss_mlp": 0.0, + "step": 791, + "time_per_iteration": 2.947917938232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089669, + "balance_loss_mlp": 1.0730865, + "diversity_loss_mlp": 0.0, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07518738092336623, + "language_loss": 0.86102855, + "learning_rate": 0.0009612512760327879, + "loss": 0.87192523, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 792, + "time_per_iteration": 2.887404203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.07553315, + "diversity_loss_mlp": 0.0, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09992337759040973, + "language_loss": 0.85428631, + "learning_rate": 0.0009611309339802909, + "loss": 0.86521071, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 793, + "time_per_iteration": 2.463308811187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101517, + "balance_loss_mlp": 1.08537626, + "diversity_loss_mlp": 0.0, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.07717151134226699, + "language_loss": 0.84535038, + "learning_rate": 0.0009610104129008881, + "loss": 0.85636556, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 794, + "time_per_iteration": 3.1276698112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108014, + "balance_loss_mlp": 1.09176612, + "diversity_loss_mlp": 0.0, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.07067272187318202, + "language_loss": 0.88475168, + "learning_rate": 0.0009608897128413701, + "loss": 0.89583182, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 795, + "time_per_iteration": 2.7658157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110863, + "balance_loss_mlp": 1.09251332, + "diversity_loss_mlp": 0.0, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.05987412473430484, + "language_loss": 0.85522842, + "learning_rate": 0.0009607688338485965, + "loss": 0.86631477, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 796, + "time_per_iteration": 2.849942207336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_mlp": 1.10935068, + "diversity_loss_mlp": 0.0, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.07148533051381147, + "language_loss": 0.90245026, + "learning_rate": 0.0009606477759694969, + "loss": 0.91371006, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 797, + "time_per_iteration": 3.0240113735198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12839675, + "diversity_loss_mlp": 0.0, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.07535837127697287, + "language_loss": 0.87540114, + "learning_rate": 0.0009605265392510703, + "loss": 0.88684577, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 798, + "time_per_iteration": 2.6324868202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147656, + "balance_loss_mlp": 1.13140786, + "diversity_loss_mlp": 0.0, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.070317951825601, + "language_loss": 0.91919398, + "learning_rate": 0.0009604051237403846, + "loss": 0.93067056, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 799, + "time_per_iteration": 2.6472957134246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159957, + "balance_loss_mlp": 1.14441192, + "diversity_loss_mlp": 0.0, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.08825283549053219, + "language_loss": 0.8626982, + "learning_rate": 0.0009602835294845776, + "loss": 0.8742978, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 800, + "time_per_iteration": 2.4501516819000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141823, + "balance_loss_mlp": 1.12552738, + "diversity_loss_mlp": 0.0, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.07489761537063061, + "language_loss": 0.89964634, + "learning_rate": 0.0009601617565308565, + "loss": 0.91106457, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 801, + "time_per_iteration": 2.6480391025543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00945745, + "balance_loss_mlp": 1.65525413, + "diversity_loss_mlp": 0.20237769, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.03656221347615257, + "language_loss": 0.8655234, + "learning_rate": 0.0009600398049264977, + "loss": 0.87498081, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01692954, + "step": 802, + "time_per_iteration": 3.0029048919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00923116, + "balance_loss_mlp": 1.61011553, + "diversity_loss_mlp": 0.20312682, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.045238735441598905, + "language_loss": 0.92041564, + "learning_rate": 0.0009599176747188469, + "loss": 0.92964679, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0164945, + "step": 803, + "time_per_iteration": 2.860461473464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113914, + "balance_loss_mlp": 1.12246239, + "diversity_loss_mlp": 0.0, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.08350523706559901, + "language_loss": 0.83155477, + "learning_rate": 0.0009597953659553196, + "loss": 0.84294617, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.16687012, + "routerloss_mlp": 0.0, + "step": 804, + "time_per_iteration": 2.733302116394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139673, + "balance_loss_mlp": 1.12363935, + "diversity_loss_mlp": 0.0, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08094420015679657, + "language_loss": 0.89484847, + "learning_rate": 0.0009596728786833997, + "loss": 0.90624517, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.16027832, + "routerloss_mlp": 0.0, + "step": 805, + "time_per_iteration": 2.602963447570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112483, + "balance_loss_mlp": 1.10851073, + "diversity_loss_mlp": 0.0, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.09295267358895155, + "language_loss": 0.8926357, + "learning_rate": 0.0009595502129506415, + "loss": 0.90388405, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 806, + "time_per_iteration": 3.358494997024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112525, + "balance_loss_mlp": 1.10893035, + "diversity_loss_mlp": 0.0, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.09807919542340894, + "language_loss": 0.82600027, + "learning_rate": 0.0009594273688046678, + "loss": 0.83725274, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 807, + "time_per_iteration": 2.7516088485717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121041, + "balance_loss_mlp": 1.10408974, + "diversity_loss_mlp": 0.0, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.13657059547118527, + "language_loss": 0.85685933, + "learning_rate": 0.000959304346293171, + "loss": 0.86806977, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 808, + "time_per_iteration": 2.676118850708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133717, + "balance_loss_mlp": 1.11686087, + "diversity_loss_mlp": 0.0, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.08670416080232539, + "language_loss": 0.88104093, + "learning_rate": 0.0009591811454639125, + "loss": 0.89237815, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.16870117, + "routerloss_mlp": 0.0, + "step": 809, + "time_per_iteration": 2.806877613067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143795, + "balance_loss_mlp": 1.12712979, + "diversity_loss_mlp": 0.0, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.07575766208840308, + "language_loss": 0.88623202, + "learning_rate": 0.0009590577663647234, + "loss": 0.89766991, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 810, + "time_per_iteration": 2.705397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167139, + "balance_loss_mlp": 1.15012765, + "diversity_loss_mlp": 0.0, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.07966338850805216, + "language_loss": 0.86178398, + "learning_rate": 0.0009589342090435036, + "loss": 0.87345541, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.17028809, + "routerloss_mlp": 0.0, + "step": 811, + "time_per_iteration": 2.767648935317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164589, + "balance_loss_mlp": 1.14749408, + "diversity_loss_mlp": 0.0, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07988119295983553, + "language_loss": 0.87430739, + "learning_rate": 0.0009588104735482223, + "loss": 0.88595331, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 812, + "time_per_iteration": 2.6543996334075928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167363, + "balance_loss_mlp": 1.14989901, + "diversity_loss_mlp": 0.0, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.09429144108453459, + "language_loss": 0.83906114, + "learning_rate": 0.0009586865599269177, + "loss": 0.85073483, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.17480469, + "routerloss_mlp": 0.0, + "step": 813, + "time_per_iteration": 2.632206439971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180179, + "balance_loss_mlp": 1.1632992, + "diversity_loss_mlp": 0.0, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.08748302318090055, + "language_loss": 0.88416874, + "learning_rate": 0.0009585624682276977, + "loss": 0.89597052, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.16894531, + "routerloss_mlp": 0.0, + "step": 814, + "time_per_iteration": 2.7365036010742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187488, + "balance_loss_mlp": 1.17066741, + "diversity_loss_mlp": 0.0, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.08109713122840453, + "language_loss": 0.87263978, + "learning_rate": 0.0009584381984987386, + "loss": 0.88451469, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 815, + "time_per_iteration": 2.5354831218719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011941, + "balance_loss_mlp": 1.1770407, + "diversity_loss_mlp": 0.0, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.07928759805262754, + "language_loss": 0.89978456, + "learning_rate": 0.0009583137507882864, + "loss": 0.91172552, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.17077637, + "routerloss_mlp": 0.0, + "step": 816, + "time_per_iteration": 2.679156541824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00895961, + "balance_loss_mlp": 1.55854249, + "diversity_loss_mlp": 0.20119007, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.035733799703693336, + "language_loss": 0.81236839, + "learning_rate": 0.000958189125144656, + "loss": 0.82132804, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160944, + "step": 817, + "time_per_iteration": 2.6629080772399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211679, + "balance_loss_mlp": 1.1954186, + "diversity_loss_mlp": 0.0, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.08655764528844483, + "language_loss": 0.88309336, + "learning_rate": 0.0009580643216162313, + "loss": 0.89521015, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 818, + "time_per_iteration": 2.6631743907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174608, + "balance_loss_mlp": 1.15813375, + "diversity_loss_mlp": 0.0, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.07543766685957613, + "language_loss": 0.79610753, + "learning_rate": 0.0009579393402514652, + "loss": 0.80785358, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 819, + "time_per_iteration": 2.5706892013549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116637, + "balance_loss_mlp": 1.15002656, + "diversity_loss_mlp": 0.0, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.08555828674018097, + "language_loss": 0.90543056, + "learning_rate": 0.0009578141810988801, + "loss": 0.91709423, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 820, + "time_per_iteration": 2.6443581581115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154879, + "balance_loss_mlp": 1.13852358, + "diversity_loss_mlp": 0.0, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.08457683432578478, + "language_loss": 0.90617025, + "learning_rate": 0.0009576888442070668, + "loss": 0.91771901, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.16357422, + "routerloss_mlp": 0.0, + "step": 821, + "time_per_iteration": 2.588172197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131243, + "balance_loss_mlp": 1.11597228, + "diversity_loss_mlp": 0.0, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.08246293521158644, + "language_loss": 0.92183721, + "learning_rate": 0.0009575633296246854, + "loss": 0.93314958, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 822, + "time_per_iteration": 2.5674116611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00894902, + "balance_loss_mlp": 1.55344844, + "diversity_loss_mlp": 0.20225295, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.035537794180972825, + "language_loss": 0.83368647, + "learning_rate": 0.0009574376374004652, + "loss": 0.84263551, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01705186, + "step": 823, + "time_per_iteration": 2.6215808391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124038, + "balance_loss_mlp": 1.10815978, + "diversity_loss_mlp": 0.0, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.07732147283422666, + "language_loss": 0.801727, + "learning_rate": 0.000957311767583204, + "loss": 0.81296742, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 824, + "time_per_iteration": 2.6025402545928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.12617576, + "diversity_loss_mlp": 0.0, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.06675818035974217, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83219701, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.21972656, + "routerloss_mlp": 0.0, + "step": 825, + "time_per_iteration": 4.730658531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00883043, + "balance_loss_mlp": 1.5295732, + "diversity_loss_mlp": 0.20110103, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0472865977200058, + "language_loss": 0.91635585, + "learning_rate": 0.0009570594953650961, + "loss": 0.92518628, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01770616, + "step": 826, + "time_per_iteration": 2.528219699859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_mlp": 1.10247803, + "diversity_loss_mlp": 0.0, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.1137923923451387, + "language_loss": 0.80430406, + "learning_rate": 0.00095693309306219, + "loss": 0.81549597, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 827, + "time_per_iteration": 3.0950989723205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111184, + "balance_loss_mlp": 1.09513879, + "diversity_loss_mlp": 0.0, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.08215179220405018, + "language_loss": 0.87886679, + "learning_rate": 0.0009568065133621244, + "loss": 0.8899852, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.16699219, + "routerloss_mlp": 0.0, + "step": 828, + "time_per_iteration": 3.367777109146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106235, + "balance_loss_mlp": 1.08993912, + "diversity_loss_mlp": 0.0, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.0806870261134831, + "language_loss": 0.85100621, + "learning_rate": 0.0009566797563140422, + "loss": 0.86206853, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 829, + "time_per_iteration": 2.8803212642669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122437, + "balance_loss_mlp": 1.10618925, + "diversity_loss_mlp": 0.0, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.0881590388408274, + "language_loss": 0.88045579, + "learning_rate": 0.0009565528219671547, + "loss": 0.89168018, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 830, + "time_per_iteration": 2.8965914249420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130205, + "balance_loss_mlp": 1.11437368, + "diversity_loss_mlp": 0.0, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.08433678519740714, + "language_loss": 0.84820044, + "learning_rate": 0.0009564257103707418, + "loss": 0.85950249, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 831, + "time_per_iteration": 2.6071205139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138047, + "balance_loss_mlp": 1.12237096, + "diversity_loss_mlp": 0.0, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.08192391736137887, + "language_loss": 0.90990019, + "learning_rate": 0.0009562984215741533, + "loss": 0.92128068, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.15661621, + "routerloss_mlp": 0.0, + "step": 832, + "time_per_iteration": 2.647022008895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126204, + "balance_loss_mlp": 1.11050415, + "diversity_loss_mlp": 0.0, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.08304692865674389, + "language_loss": 0.8233614, + "learning_rate": 0.0009561709556268065, + "loss": 0.83462346, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 833, + "time_per_iteration": 2.7033326625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113334, + "balance_loss_mlp": 1.09758639, + "diversity_loss_mlp": 0.0, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.1118379895427605, + "language_loss": 0.94022137, + "learning_rate": 0.0009560433125781884, + "loss": 0.95135468, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 834, + "time_per_iteration": 2.7286314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137088, + "balance_loss_mlp": 1.12088716, + "diversity_loss_mlp": 0.0, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.07457680689162895, + "language_loss": 0.92389894, + "learning_rate": 0.0009559154924778544, + "loss": 0.93526971, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 835, + "time_per_iteration": 2.7348785400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143876, + "balance_loss_mlp": 1.12812805, + "diversity_loss_mlp": 0.0, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.10043267780752475, + "language_loss": 0.85037422, + "learning_rate": 0.0009557874953754284, + "loss": 0.86181295, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 836, + "time_per_iteration": 3.069246768951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156501, + "balance_loss_mlp": 1.14049125, + "diversity_loss_mlp": 0.0, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08327927090533828, + "language_loss": 0.83506572, + "learning_rate": 0.0009556593213206038, + "loss": 0.84663069, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 837, + "time_per_iteration": 2.7368414402008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190738, + "balance_loss_mlp": 1.17505026, + "diversity_loss_mlp": 0.0, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.08045457133261572, + "language_loss": 0.87076676, + "learning_rate": 0.0009555309703631414, + "loss": 0.88267422, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 838, + "time_per_iteration": 2.72027850151062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180132, + "balance_loss_mlp": 1.16382456, + "diversity_loss_mlp": 0.0, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.09367634959673259, + "language_loss": 0.87476748, + "learning_rate": 0.0009554024425528722, + "loss": 0.88656878, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 839, + "time_per_iteration": 2.7314722537994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173375, + "balance_loss_mlp": 1.15756762, + "diversity_loss_mlp": 0.0, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.0683151622017414, + "language_loss": 0.88983327, + "learning_rate": 0.0009552737379396948, + "loss": 0.90156698, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.15795898, + "routerloss_mlp": 0.0, + "step": 840, + "time_per_iteration": 2.6384117603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165757, + "balance_loss_mlp": 1.14950919, + "diversity_loss_mlp": 0.0, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.08203724053437887, + "language_loss": 0.87545735, + "learning_rate": 0.0009551448565735767, + "loss": 0.88711488, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 841, + "time_per_iteration": 2.7497382164001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158402, + "balance_loss_mlp": 1.14156926, + "diversity_loss_mlp": 0.0, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.08523302245909381, + "language_loss": 0.84374112, + "learning_rate": 0.0009550157985045543, + "loss": 0.8553251, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 842, + "time_per_iteration": 3.080169916152954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114708, + "balance_loss_mlp": 1.13046193, + "diversity_loss_mlp": 0.0, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.10255895710786052, + "language_loss": 0.89356017, + "learning_rate": 0.0009548865637827321, + "loss": 0.90503097, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 843, + "time_per_iteration": 2.684195041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158581, + "balance_loss_mlp": 1.14129627, + "diversity_loss_mlp": 0.0, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.08376364289368579, + "language_loss": 0.89409387, + "learning_rate": 0.0009547571524582838, + "loss": 0.90567964, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.17297363, + "routerloss_mlp": 0.0, + "step": 844, + "time_per_iteration": 2.5846645832061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157702, + "balance_loss_mlp": 1.14051175, + "diversity_loss_mlp": 0.0, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.09201378669766774, + "language_loss": 0.92096436, + "learning_rate": 0.0009546275645814512, + "loss": 0.93254137, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.17211914, + "routerloss_mlp": 0.0, + "step": 845, + "time_per_iteration": 2.603830575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165367, + "balance_loss_mlp": 1.1485343, + "diversity_loss_mlp": 0.0, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.11870998115484692, + "language_loss": 0.8935858, + "learning_rate": 0.0009544978002025446, + "loss": 0.90523952, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 846, + "time_per_iteration": 2.57155179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167547, + "balance_loss_mlp": 1.15075064, + "diversity_loss_mlp": 0.0, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.08095587687984966, + "language_loss": 0.86639023, + "learning_rate": 0.0009543678593719434, + "loss": 0.87806571, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.16809082, + "routerloss_mlp": 0.0, + "step": 847, + "time_per_iteration": 2.7022597789764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189002, + "balance_loss_mlp": 1.17215741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.06757237913003537, + "language_loss": 0.87374425, + "learning_rate": 0.0009542377421400945, + "loss": 0.8856343, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.1685791, + "routerloss_mlp": 0.0, + "step": 848, + "time_per_iteration": 2.7858939170837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209239, + "balance_loss_mlp": 1.1922878, + "diversity_loss_mlp": 0.0, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.0709695929057924, + "language_loss": 0.83489215, + "learning_rate": 0.0009541074485575145, + "loss": 0.84698457, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 849, + "time_per_iteration": 2.7202138900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206318, + "balance_loss_mlp": 1.18949735, + "diversity_loss_mlp": 0.0, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.09796618546415216, + "language_loss": 0.91934282, + "learning_rate": 0.0009539769786747874, + "loss": 0.93140602, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 850, + "time_per_iteration": 2.6165611743927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183142, + "balance_loss_mlp": 1.16619003, + "diversity_loss_mlp": 0.0, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.08882238893928415, + "language_loss": 0.81184316, + "learning_rate": 0.0009538463325425665, + "loss": 0.82367456, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.16967773, + "routerloss_mlp": 0.0, + "step": 851, + "time_per_iteration": 2.686708927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.13394117, + "diversity_loss_mlp": 0.0, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.07439357185799754, + "language_loss": 0.85950458, + "learning_rate": 0.0009537155102115728, + "loss": 0.87101221, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 852, + "time_per_iteration": 2.5918595790863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875998, + "balance_loss_mlp": 1.52336514, + "diversity_loss_mlp": 0.19506347, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.033648266618603755, + "language_loss": 0.83653182, + "learning_rate": 0.0009535845117325961, + "loss": 0.84529185, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0167836, + "step": 853, + "time_per_iteration": 2.724388599395752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.08957744, + "diversity_loss_mlp": 0.0, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.08216353114673619, + "language_loss": 0.93429655, + "learning_rate": 0.0009534533371564946, + "loss": 0.94536138, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 854, + "time_per_iteration": 2.7487661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011031, + "balance_loss_mlp": 1.08627963, + "diversity_loss_mlp": 0.0, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.1393079137823864, + "language_loss": 0.88947123, + "learning_rate": 0.0009533219865341949, + "loss": 0.9005022, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 855, + "time_per_iteration": 2.5900051593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095935, + "balance_loss_mlp": 1.0794363, + "diversity_loss_mlp": 0.0, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.09213408499242232, + "language_loss": 0.86629748, + "learning_rate": 0.0009531904599166916, + "loss": 0.87725687, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 856, + "time_per_iteration": 2.6516594886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093162, + "balance_loss_mlp": 1.07659197, + "diversity_loss_mlp": 0.0, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.11803940214792888, + "language_loss": 0.85319799, + "learning_rate": 0.0009530587573550478, + "loss": 0.86412966, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 857, + "time_per_iteration": 2.6046345233917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.06968486, + "diversity_loss_mlp": 0.0, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.035898632567184195, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75406808, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.17773438, + "routerloss_mlp": 0.0, + "step": 858, + "time_per_iteration": 5.039424180984497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113669, + "balance_loss_mlp": 1.12172914, + "diversity_loss_mlp": 0.0, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.11200047020164162, + "language_loss": 0.90257657, + "learning_rate": 0.0009527948246039337, + "loss": 0.91394353, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.14929199, + "routerloss_mlp": 0.0, + "step": 859, + "time_per_iteration": 2.550898551940918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912162, + "balance_loss_mlp": 1.5939728, + "diversity_loss_mlp": 0.19291875, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.041813305841329106, + "language_loss": 0.87981749, + "learning_rate": 0.000952662594516931, + "loss": 0.88893914, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871633, + "step": 860, + "time_per_iteration": 3.135986089706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159964, + "balance_loss_mlp": 1.14404976, + "diversity_loss_mlp": 0.0, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.09693666764449156, + "language_loss": 0.86321676, + "learning_rate": 0.0009525301886907234, + "loss": 0.87481636, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 861, + "time_per_iteration": 2.8601465225219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117936, + "balance_loss_mlp": 1.16340995, + "diversity_loss_mlp": 0.0, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.08775979857040934, + "language_loss": 0.87897611, + "learning_rate": 0.0009523976071767155, + "loss": 0.89076972, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 862, + "time_per_iteration": 2.676481246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186964, + "balance_loss_mlp": 1.17058492, + "diversity_loss_mlp": 0.0, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08829714099376759, + "language_loss": 0.87565947, + "learning_rate": 0.00095226485002638, + "loss": 0.88752913, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 863, + "time_per_iteration": 2.7554168701171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188442, + "balance_loss_mlp": 1.17221785, + "diversity_loss_mlp": 0.0, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.07683945950910559, + "language_loss": 0.89008975, + "learning_rate": 0.0009521319172912576, + "loss": 0.90197414, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.16223145, + "routerloss_mlp": 0.0, + "step": 864, + "time_per_iteration": 2.7515084743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180456, + "balance_loss_mlp": 1.16381395, + "diversity_loss_mlp": 0.0, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.07957847945510911, + "language_loss": 0.95031559, + "learning_rate": 0.0009519988090229579, + "loss": 0.96212018, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 865, + "time_per_iteration": 2.671473741531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177408, + "balance_loss_mlp": 1.16058719, + "diversity_loss_mlp": 0.0, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.08787110668844439, + "language_loss": 0.87748879, + "learning_rate": 0.0009518655252731576, + "loss": 0.8892628, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.16833496, + "routerloss_mlp": 0.0, + "step": 866, + "time_per_iteration": 2.7561991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152051, + "balance_loss_mlp": 1.13470602, + "diversity_loss_mlp": 0.0, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.07641565274747647, + "language_loss": 0.90193641, + "learning_rate": 0.0009517320660936022, + "loss": 0.91345698, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.17358398, + "routerloss_mlp": 0.0, + "step": 867, + "time_per_iteration": 2.7005693912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177189, + "balance_loss_mlp": 1.16064239, + "diversity_loss_mlp": 0.0, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.08424262891613502, + "language_loss": 0.83321446, + "learning_rate": 0.0009515984315361051, + "loss": 0.84498632, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.16552734, + "routerloss_mlp": 0.0, + "step": 868, + "time_per_iteration": 2.7969586849212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167914, + "balance_loss_mlp": 1.15145087, + "diversity_loss_mlp": 0.0, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.08829416831991993, + "language_loss": 0.87132847, + "learning_rate": 0.000951464621652548, + "loss": 0.88300765, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 869, + "time_per_iteration": 2.6121644973754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152825, + "balance_loss_mlp": 1.13639808, + "diversity_loss_mlp": 0.0, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.07099792340868973, + "language_loss": 0.79077303, + "learning_rate": 0.0009513306364948804, + "loss": 0.80230129, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 870, + "time_per_iteration": 2.7814862728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140954, + "balance_loss_mlp": 1.12481356, + "diversity_loss_mlp": 0.0, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09401721418936884, + "language_loss": 0.89126736, + "learning_rate": 0.0009511964761151197, + "loss": 0.90267694, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 871, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152354, + "balance_loss_mlp": 1.13628435, + "diversity_loss_mlp": 0.0, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.07594901152089473, + "language_loss": 0.90430808, + "learning_rate": 0.0009510621405653521, + "loss": 0.91583163, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 872, + "time_per_iteration": 2.6015260219573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140995, + "balance_loss_mlp": 1.12449682, + "diversity_loss_mlp": 0.0, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.08553354640914074, + "language_loss": 0.84159112, + "learning_rate": 0.0009509276298977309, + "loss": 0.85300112, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.16503906, + "routerloss_mlp": 0.0, + "step": 873, + "time_per_iteration": 2.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156157, + "balance_loss_mlp": 1.13969469, + "diversity_loss_mlp": 0.0, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09960357111836311, + "language_loss": 0.81973028, + "learning_rate": 0.0009507929441644778, + "loss": 0.83129185, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.16467285, + "routerloss_mlp": 0.0, + "step": 874, + "time_per_iteration": 3.518749237060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141075, + "balance_loss_mlp": 1.12455297, + "diversity_loss_mlp": 0.0, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09789550875526438, + "language_loss": 0.86003464, + "learning_rate": 0.0009506580834178826, + "loss": 0.87144536, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.1652832, + "routerloss_mlp": 0.0, + "step": 875, + "time_per_iteration": 2.7423431873321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152406, + "balance_loss_mlp": 1.13565707, + "diversity_loss_mlp": 0.0, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.08790070613593892, + "language_loss": 0.91631377, + "learning_rate": 0.0009505230477103028, + "loss": 0.92783785, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 876, + "time_per_iteration": 2.698725938796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133355, + "balance_loss_mlp": 1.11677289, + "diversity_loss_mlp": 0.0, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.09908277874944699, + "language_loss": 0.81365788, + "learning_rate": 0.0009503878370941641, + "loss": 0.82499135, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 877, + "time_per_iteration": 2.791314125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891363, + "balance_loss_mlp": 1.54620337, + "diversity_loss_mlp": 0.20141272, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.04203797903351432, + "language_loss": 0.89092785, + "learning_rate": 0.0009502524516219595, + "loss": 0.89984149, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01755447, + "step": 878, + "time_per_iteration": 2.776076078414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_mlp": 1.12719083, + "diversity_loss_mlp": 0.0, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.08982042340710936, + "language_loss": 0.90123284, + "learning_rate": 0.0009501168913462506, + "loss": 0.91266429, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.15942383, + "routerloss_mlp": 0.0, + "step": 879, + "time_per_iteration": 2.6948277950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112281, + "balance_loss_mlp": 1.09587741, + "diversity_loss_mlp": 0.0, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.05096984028598956, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80234206, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 880, + "time_per_iteration": 4.850466728210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143836, + "balance_loss_mlp": 1.12831497, + "diversity_loss_mlp": 0.0, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08080936273118028, + "language_loss": 0.85235959, + "learning_rate": 0.0009498452465949042, + "loss": 0.8637979, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.1550293, + "routerloss_mlp": 0.0, + "step": 881, + "time_per_iteration": 3.2163655757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147334, + "balance_loss_mlp": 1.13156271, + "diversity_loss_mlp": 0.0, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06875421208466073, + "language_loss": 0.91363323, + "learning_rate": 0.0009497091622247285, + "loss": 0.92510653, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 882, + "time_per_iteration": 2.686939239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152935, + "balance_loss_mlp": 1.13735437, + "diversity_loss_mlp": 0.0, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.08376903723107024, + "language_loss": 0.93688583, + "learning_rate": 0.0009495729032619723, + "loss": 0.94841516, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.15563965, + "routerloss_mlp": 0.0, + "step": 883, + "time_per_iteration": 2.709554433822632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164282, + "balance_loss_mlp": 1.14845097, + "diversity_loss_mlp": 0.0, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.07836441801613908, + "language_loss": 0.83897853, + "learning_rate": 0.0009494364697595354, + "loss": 0.85062128, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.15820312, + "routerloss_mlp": 0.0, + "step": 884, + "time_per_iteration": 2.905869722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192457, + "balance_loss_mlp": 1.17685246, + "diversity_loss_mlp": 0.0, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08347533231949411, + "language_loss": 0.89193916, + "learning_rate": 0.0009492998617703867, + "loss": 0.90386373, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 885, + "time_per_iteration": 2.655181884765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196193, + "balance_loss_mlp": 1.18021917, + "diversity_loss_mlp": 0.0, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.09597329726050118, + "language_loss": 0.87667245, + "learning_rate": 0.0009491630793475619, + "loss": 0.88863432, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 886, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195953, + "balance_loss_mlp": 1.17983615, + "diversity_loss_mlp": 0.0, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.09161300078510141, + "language_loss": 0.8529889, + "learning_rate": 0.0009490261225441643, + "loss": 0.86494851, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 887, + "time_per_iteration": 2.8882617950439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169082, + "balance_loss_mlp": 1.15244031, + "diversity_loss_mlp": 0.0, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07944379291645969, + "language_loss": 0.90366387, + "learning_rate": 0.0009488889914133656, + "loss": 0.91535467, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 888, + "time_per_iteration": 2.969808578491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192276, + "balance_loss_mlp": 1.17532432, + "diversity_loss_mlp": 0.0, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.0816216626447537, + "language_loss": 0.89335579, + "learning_rate": 0.0009487516860084047, + "loss": 0.90527856, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.16955566, + "routerloss_mlp": 0.0, + "step": 889, + "time_per_iteration": 2.6975717544555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164555, + "balance_loss_mlp": 1.14738929, + "diversity_loss_mlp": 0.0, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.08956429914743876, + "language_loss": 0.88835347, + "learning_rate": 0.0009486142063825884, + "loss": 0.89999902, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 890, + "time_per_iteration": 2.5376908779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087842, + "balance_loss_mlp": 1.07248783, + "diversity_loss_mlp": 0.0, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.041165905845677725, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73514056, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 891, + "time_per_iteration": 4.961901664733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168071, + "balance_loss_mlp": 1.15150142, + "diversity_loss_mlp": 0.0, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.09530662242326329, + "language_loss": 0.89790797, + "learning_rate": 0.0009483387246819542, + "loss": 0.90958869, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 892, + "time_per_iteration": 2.7075483798980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.0489924, + "diversity_loss_mlp": 0.0, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03173229244132217, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83349359, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 893, + "time_per_iteration": 4.639479398727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175334, + "balance_loss_mlp": 1.15915704, + "diversity_loss_mlp": 0.0, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.09568003043121609, + "language_loss": 0.88799989, + "learning_rate": 0.0009480625467392688, + "loss": 0.89975327, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 894, + "time_per_iteration": 2.6601061820983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062971, + "balance_loss_mlp": 1.04933381, + "diversity_loss_mlp": 0.0, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.02668432598653126, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79057646, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 895, + "time_per_iteration": 4.739619970321655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154117, + "balance_loss_mlp": 1.13857174, + "diversity_loss_mlp": 0.0, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.0641043143423189, + "language_loss": 0.87743723, + "learning_rate": 0.0009477856729834196, + "loss": 0.88897842, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 896, + "time_per_iteration": 2.7397632598876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143695, + "balance_loss_mlp": 1.12863934, + "diversity_loss_mlp": 0.0, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.08265751895316475, + "language_loss": 0.89999056, + "learning_rate": 0.0009476469753098809, + "loss": 0.9114275, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 897, + "time_per_iteration": 2.7494678497314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151499, + "balance_loss_mlp": 1.13624024, + "diversity_loss_mlp": 0.0, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08701823937514089, + "language_loss": 0.86839932, + "learning_rate": 0.0009475081038443738, + "loss": 0.87991428, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.15234375, + "routerloss_mlp": 0.0, + "step": 898, + "time_per_iteration": 2.6241486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147135, + "balance_loss_mlp": 1.13179302, + "diversity_loss_mlp": 0.0, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.10104724937619765, + "language_loss": 0.85756111, + "learning_rate": 0.0009473690586408124, + "loss": 0.86903244, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 899, + "time_per_iteration": 2.8371973037719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141451, + "balance_loss_mlp": 1.1257633, + "diversity_loss_mlp": 0.0, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08019640817702944, + "language_loss": 0.86364079, + "learning_rate": 0.0009472298397531792, + "loss": 0.87505525, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 900, + "time_per_iteration": 2.742392063140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158392, + "balance_loss_mlp": 1.14285886, + "diversity_loss_mlp": 0.0, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.08623310667606855, + "language_loss": 0.86846912, + "learning_rate": 0.0009470904472355235, + "loss": 0.88005304, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 901, + "time_per_iteration": 2.6695165634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168499, + "balance_loss_mlp": 1.15235806, + "diversity_loss_mlp": 0.0, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08505658620970231, + "language_loss": 0.7976377, + "learning_rate": 0.0009469508811419626, + "loss": 0.80932266, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.16137695, + "routerloss_mlp": 0.0, + "step": 902, + "time_per_iteration": 2.706495761871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295395, + "balance_loss_mlp": 1.28533375, + "diversity_loss_mlp": 0.0, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.12561294289393785, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72909224, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 903, + "time_per_iteration": 4.816544532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201232, + "balance_loss_mlp": 1.18432808, + "diversity_loss_mlp": 0.0, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.08260915403461032, + "language_loss": 0.83578205, + "learning_rate": 0.0009466712284439292, + "loss": 0.84779429, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.16918945, + "routerloss_mlp": 0.0, + "step": 904, + "time_per_iteration": 2.7518186569213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225673, + "balance_loss_mlp": 1.20837545, + "diversity_loss_mlp": 0.0, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.10172065741669829, + "language_loss": 0.88445127, + "learning_rate": 0.0009465311419480276, + "loss": 0.89670801, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 905, + "time_per_iteration": 2.6713294982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222896, + "balance_loss_mlp": 1.20540833, + "diversity_loss_mlp": 0.0, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.08928567213571854, + "language_loss": 0.88188136, + "learning_rate": 0.0009463908820933622, + "loss": 0.89411032, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 906, + "time_per_iteration": 2.838935375213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211371, + "balance_loss_mlp": 1.19455028, + "diversity_loss_mlp": 0.0, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.07641026648080583, + "language_loss": 0.82561022, + "learning_rate": 0.0009462504489343868, + "loss": 0.83772391, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 907, + "time_per_iteration": 2.814695119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176767, + "balance_loss_mlp": 1.15961313, + "diversity_loss_mlp": 0.0, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.1031074016814366, + "language_loss": 0.88790941, + "learning_rate": 0.0009461098425256222, + "loss": 0.89967716, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 908, + "time_per_iteration": 2.6116297245025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159634, + "balance_loss_mlp": 1.14329028, + "diversity_loss_mlp": 0.0, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08015161116044169, + "language_loss": 0.86030436, + "learning_rate": 0.0009459690629216567, + "loss": 0.87190068, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 909, + "time_per_iteration": 2.6483752727508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130085, + "balance_loss_mlp": 1.11407518, + "diversity_loss_mlp": 0.0, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.1301831169035446, + "language_loss": 0.87761313, + "learning_rate": 0.0009458281101771457, + "loss": 0.88891399, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 910, + "time_per_iteration": 2.6089227199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00992009, + "balance_loss_mlp": 1.75545192, + "diversity_loss_mlp": 0.19214596, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.033219305186726854, + "language_loss": 0.82887536, + "learning_rate": 0.0009456869843468122, + "loss": 0.83879542, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01820984, + "step": 911, + "time_per_iteration": 2.895577907562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110935, + "balance_loss_mlp": 1.09519958, + "diversity_loss_mlp": 0.0, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.09801228329993106, + "language_loss": 0.78689641, + "learning_rate": 0.0009455456854854459, + "loss": 0.79800576, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 912, + "time_per_iteration": 2.61677885055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112332, + "balance_loss_mlp": 1.09684718, + "diversity_loss_mlp": 0.0, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.10345929433375275, + "language_loss": 0.84027654, + "learning_rate": 0.0009454042136479039, + "loss": 0.8513999, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 913, + "time_per_iteration": 2.63289737701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970368, + "balance_loss_mlp": 1.71473479, + "diversity_loss_mlp": 0.18966624, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.036406885856323776, + "language_loss": 0.82874572, + "learning_rate": 0.0009452625688891103, + "loss": 0.83844936, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01816791, + "step": 914, + "time_per_iteration": 2.5505056381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00652668, + "balance_loss_mlp": 1.1176697, + "diversity_loss_mlp": 0.15453993, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.002103211778310914, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79387403, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01656273, + "step": 915, + "time_per_iteration": 4.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138887, + "balance_loss_mlp": 1.12381876, + "diversity_loss_mlp": 0.0, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.10180381633640839, + "language_loss": 0.92940623, + "learning_rate": 0.0009449787608278015, + "loss": 0.94079512, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 916, + "time_per_iteration": 2.7294180393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155245, + "balance_loss_mlp": 1.13949776, + "diversity_loss_mlp": 0.0, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08481056496958321, + "language_loss": 0.92318904, + "learning_rate": 0.0009448365976354704, + "loss": 0.9347415, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 917, + "time_per_iteration": 2.4908158779144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174187, + "balance_loss_mlp": 1.15821338, + "diversity_loss_mlp": 0.0, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.1031397623895646, + "language_loss": 0.89928877, + "learning_rate": 0.0009446942617422558, + "loss": 0.91103065, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.15966797, + "routerloss_mlp": 0.0, + "step": 918, + "time_per_iteration": 2.5721499919891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191219, + "balance_loss_mlp": 1.1748755, + "diversity_loss_mlp": 0.0, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.17804953788653613, + "language_loss": 0.85687363, + "learning_rate": 0.0009445517532034176, + "loss": 0.86878586, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.16345215, + "routerloss_mlp": 0.0, + "step": 919, + "time_per_iteration": 2.6613845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195517, + "balance_loss_mlp": 1.18031824, + "diversity_loss_mlp": 0.0, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09678678856513988, + "language_loss": 0.89147103, + "learning_rate": 0.0009444090720742824, + "loss": 0.90342629, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 920, + "time_per_iteration": 2.587042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186456, + "balance_loss_mlp": 1.17107785, + "diversity_loss_mlp": 0.0, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.10185153476697495, + "language_loss": 0.87654328, + "learning_rate": 0.0009442662184102439, + "loss": 0.88840789, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 921, + "time_per_iteration": 2.8263702392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153869, + "balance_loss_mlp": 1.13851511, + "diversity_loss_mlp": 0.0, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07023953845341, + "language_loss": 0.87764925, + "learning_rate": 0.000944123192266763, + "loss": 0.88918793, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 922, + "time_per_iteration": 2.789288282394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914197, + "balance_loss_mlp": 1.60349846, + "diversity_loss_mlp": 0.18745996, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.03372690713262746, + "language_loss": 0.83555657, + "learning_rate": 0.0009439799936993671, + "loss": 0.84469855, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01871805, + "step": 923, + "time_per_iteration": 2.7374520301818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137351, + "balance_loss_mlp": 1.12125802, + "diversity_loss_mlp": 0.0, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.08202300708599226, + "language_loss": 0.87886107, + "learning_rate": 0.0009438366227636511, + "loss": 0.89023459, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.16088867, + "routerloss_mlp": 0.0, + "step": 924, + "time_per_iteration": 2.7159595489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148154, + "balance_loss_mlp": 1.13190556, + "diversity_loss_mlp": 0.0, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.08035818105278464, + "language_loss": 0.86048192, + "learning_rate": 0.0009436930795152763, + "loss": 0.8719635, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 925, + "time_per_iteration": 2.8248116970062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143318, + "balance_loss_mlp": 1.12739205, + "diversity_loss_mlp": 0.0, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.07405817727017547, + "language_loss": 0.86317486, + "learning_rate": 0.0009435493640099713, + "loss": 0.87460804, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 926, + "time_per_iteration": 2.8155741691589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161834, + "balance_loss_mlp": 1.1451211, + "diversity_loss_mlp": 0.0, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.09122083849675254, + "language_loss": 0.84453332, + "learning_rate": 0.0009434054763035314, + "loss": 0.8561517, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 927, + "time_per_iteration": 2.636686325073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158411, + "balance_loss_mlp": 1.1422224, + "diversity_loss_mlp": 0.0, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0663266274239875, + "language_loss": 0.85362542, + "learning_rate": 0.0009432614164518185, + "loss": 0.86520946, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 928, + "time_per_iteration": 2.9446685314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171163, + "balance_loss_mlp": 1.15443754, + "diversity_loss_mlp": 0.0, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.07726522608444414, + "language_loss": 0.84178561, + "learning_rate": 0.000943117184510762, + "loss": 0.85349721, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 929, + "time_per_iteration": 3.0194530487060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175374, + "balance_loss_mlp": 1.16435885, + "diversity_loss_mlp": 0.0, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.030831515732685378, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79965341, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 930, + "time_per_iteration": 5.04656982421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172004, + "balance_loss_mlp": 1.15555263, + "diversity_loss_mlp": 0.0, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.08209248711818126, + "language_loss": 0.88495553, + "learning_rate": 0.0009428282045846674, + "loss": 0.89667559, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.16455078, + "routerloss_mlp": 0.0, + "step": 931, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905029, + "balance_loss_mlp": 1.58147573, + "diversity_loss_mlp": 0.18920106, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.030391877730158674, + "language_loss": 0.89804769, + "learning_rate": 0.0009426834567118214, + "loss": 0.90709794, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01969042, + "step": 932, + "time_per_iteration": 3.0804004669189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174106, + "balance_loss_mlp": 1.15761924, + "diversity_loss_mlp": 0.0, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.06967623980831897, + "language_loss": 0.80600739, + "learning_rate": 0.0009425385369740155, + "loss": 0.81774843, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.16491699, + "routerloss_mlp": 0.0, + "step": 933, + "time_per_iteration": 3.039576530456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172613, + "balance_loss_mlp": 1.15553069, + "diversity_loss_mlp": 0.0, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.09198882046168515, + "language_loss": 0.87049097, + "learning_rate": 0.0009423934454275125, + "loss": 0.88221705, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.17102051, + "routerloss_mlp": 0.0, + "step": 934, + "time_per_iteration": 2.8528192043304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147429, + "balance_loss_mlp": 1.13053656, + "diversity_loss_mlp": 0.0, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.09002999058802562, + "language_loss": 0.92077851, + "learning_rate": 0.0009422481821286418, + "loss": 0.93225282, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.16906738, + "routerloss_mlp": 0.0, + "step": 935, + "time_per_iteration": 2.720700740814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140916, + "balance_loss_mlp": 1.12434602, + "diversity_loss_mlp": 0.0, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.11818586168906865, + "language_loss": 0.88474637, + "learning_rate": 0.0009421027471337998, + "loss": 0.89615548, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 936, + "time_per_iteration": 2.61820125579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114364, + "balance_loss_mlp": 1.12680769, + "diversity_loss_mlp": 0.0, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.13119105141522364, + "language_loss": 0.82430404, + "learning_rate": 0.0009419571404994493, + "loss": 0.83574045, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.16845703, + "routerloss_mlp": 0.0, + "step": 937, + "time_per_iteration": 2.6458749771118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.11016333, + "diversity_loss_mlp": 0.0, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10011425098636609, + "language_loss": 0.90748799, + "learning_rate": 0.00094181136228212, + "loss": 0.91875559, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 938, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132333, + "balance_loss_mlp": 1.11602521, + "diversity_loss_mlp": 0.0, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.06984091109722412, + "language_loss": 0.86027002, + "learning_rate": 0.0009416654125384077, + "loss": 0.8715933, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.16308594, + "routerloss_mlp": 0.0, + "step": 939, + "time_per_iteration": 2.723839044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182476, + "balance_loss_mlp": 1.17174697, + "diversity_loss_mlp": 0.0, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.0414358910702132, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.8095485, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 940, + "time_per_iteration": 4.920511722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141007, + "balance_loss_mlp": 1.12453222, + "diversity_loss_mlp": 0.0, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.0813056862192268, + "language_loss": 0.83903325, + "learning_rate": 0.000941372998698552, + "loss": 0.85044336, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 941, + "time_per_iteration": 2.937645673751831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00896978, + "balance_loss_mlp": 1.56833267, + "diversity_loss_mlp": 0.1911485, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.04191931915848681, + "language_loss": 0.82149267, + "learning_rate": 0.0009412265347159336, + "loss": 0.83046246, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0172378, + "step": 942, + "time_per_iteration": 2.7250781059265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116112, + "balance_loss_mlp": 1.14446664, + "diversity_loss_mlp": 0.0, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.08706600394859935, + "language_loss": 0.84761524, + "learning_rate": 0.0009410798994339829, + "loss": 0.85922647, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 943, + "time_per_iteration": 2.5916900634765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115721, + "balance_loss_mlp": 1.14027047, + "diversity_loss_mlp": 0.0, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.07414862428622851, + "language_loss": 0.87698966, + "learning_rate": 0.000940933092909628, + "loss": 0.88856173, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.16943359, + "routerloss_mlp": 0.0, + "step": 944, + "time_per_iteration": 2.6747801303863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166789, + "balance_loss_mlp": 1.15049326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07390491400887403, + "language_loss": 0.83424389, + "learning_rate": 0.0009407861151998649, + "loss": 0.84591174, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 945, + "time_per_iteration": 2.602691411972046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163795, + "balance_loss_mlp": 1.14708209, + "diversity_loss_mlp": 0.0, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07435679337016335, + "language_loss": 0.86087269, + "learning_rate": 0.0009406389663617552, + "loss": 0.87251067, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.16723633, + "routerloss_mlp": 0.0, + "step": 946, + "time_per_iteration": 2.6775379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139209, + "balance_loss_mlp": 1.12300825, + "diversity_loss_mlp": 0.0, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.08423780444915897, + "language_loss": 0.86031067, + "learning_rate": 0.000940491646452427, + "loss": 0.87170279, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.1619873, + "routerloss_mlp": 0.0, + "step": 947, + "time_per_iteration": 2.717313051223755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134537, + "balance_loss_mlp": 1.11805058, + "diversity_loss_mlp": 0.0, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.0716601161320721, + "language_loss": 0.90799212, + "learning_rate": 0.000940344155529075, + "loss": 0.91933751, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.16479492, + "routerloss_mlp": 0.0, + "step": 948, + "time_per_iteration": 2.645601749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00905236, + "balance_loss_mlp": 1.57791471, + "diversity_loss_mlp": 0.19691566, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.03478780514937427, + "language_loss": 0.87420666, + "learning_rate": 0.0009401964936489605, + "loss": 0.883259, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01782099, + "step": 949, + "time_per_iteration": 2.546546459197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132433, + "balance_loss_mlp": 1.11666203, + "diversity_loss_mlp": 0.0, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.11218622077210595, + "language_loss": 0.85308415, + "learning_rate": 0.0009400486608694108, + "loss": 0.86440849, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 950, + "time_per_iteration": 2.71462345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135805, + "balance_loss_mlp": 1.1190201, + "diversity_loss_mlp": 0.0, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.07143871570155125, + "language_loss": 0.87176299, + "learning_rate": 0.0009399006572478195, + "loss": 0.88312101, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 951, + "time_per_iteration": 3.0933260917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137853, + "balance_loss_mlp": 1.12129509, + "diversity_loss_mlp": 0.0, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.08672794105569953, + "language_loss": 0.90997601, + "learning_rate": 0.0009397524828416468, + "loss": 0.92135453, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 952, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906668, + "balance_loss_mlp": 1.58174932, + "diversity_loss_mlp": 0.19792399, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.0341945315399877, + "language_loss": 0.96079636, + "learning_rate": 0.0009396041377084192, + "loss": 0.96986312, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01683164, + "step": 953, + "time_per_iteration": 2.6563429832458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147916, + "balance_loss_mlp": 1.1312983, + "diversity_loss_mlp": 0.0, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07156922543086394, + "language_loss": 0.87274891, + "learning_rate": 0.0009394556219057295, + "loss": 0.88422805, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 954, + "time_per_iteration": 2.710129499435425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.1480366, + "diversity_loss_mlp": 0.0, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.08933499459227748, + "language_loss": 0.83389091, + "learning_rate": 0.0009393069354912362, + "loss": 0.84553862, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.1673584, + "routerloss_mlp": 0.0, + "step": 955, + "time_per_iteration": 2.736077070236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162546, + "balance_loss_mlp": 1.1459167, + "diversity_loss_mlp": 0.0, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.10088049230192819, + "language_loss": 0.81851852, + "learning_rate": 0.0009391580785226649, + "loss": 0.83014399, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.16638184, + "routerloss_mlp": 0.0, + "step": 956, + "time_per_iteration": 2.8675243854522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139492, + "balance_loss_mlp": 1.12933517, + "diversity_loss_mlp": 0.0, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.028623000900350283, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80479944, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 957, + "time_per_iteration": 4.758531332015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128949, + "balance_loss_mlp": 1.11177051, + "diversity_loss_mlp": 0.0, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.0742792603097427, + "language_loss": 0.8674221, + "learning_rate": 0.0009388598531545196, + "loss": 0.87871158, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.17175293, + "routerloss_mlp": 0.0, + "step": 958, + "time_per_iteration": 2.8665144443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110151, + "balance_loss_mlp": 1.09304404, + "diversity_loss_mlp": 0.0, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08387101873752756, + "language_loss": 0.85292655, + "learning_rate": 0.000938710484870727, + "loss": 0.86402804, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.17126465, + "routerloss_mlp": 0.0, + "step": 959, + "time_per_iteration": 2.5621094703674316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113798, + "balance_loss_mlp": 1.09718001, + "diversity_loss_mlp": 0.0, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.08027143748444723, + "language_loss": 0.85896957, + "learning_rate": 0.0009385609462644189, + "loss": 0.87010753, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.16625977, + "routerloss_mlp": 0.0, + "step": 960, + "time_per_iteration": 2.6949400901794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_mlp": 1.10596502, + "diversity_loss_mlp": 0.0, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.07967759372686231, + "language_loss": 0.8535409, + "learning_rate": 0.0009384112373936514, + "loss": 0.86476731, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 961, + "time_per_iteration": 2.644244432449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132475, + "balance_loss_mlp": 1.11566615, + "diversity_loss_mlp": 0.0, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.09330138113238175, + "language_loss": 0.91539109, + "learning_rate": 0.0009382613583165467, + "loss": 0.92671585, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.16821289, + "routerloss_mlp": 0.0, + "step": 962, + "time_per_iteration": 2.8191375732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128481, + "balance_loss_mlp": 1.11161256, + "diversity_loss_mlp": 0.0, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.08799115365988901, + "language_loss": 0.89600122, + "learning_rate": 0.0009381113090912928, + "loss": 0.90728599, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 963, + "time_per_iteration": 2.77341890335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137775, + "balance_loss_mlp": 1.12159812, + "diversity_loss_mlp": 0.0, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.08224545608030313, + "language_loss": 0.89354098, + "learning_rate": 0.000937961089776144, + "loss": 0.90491867, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 964, + "time_per_iteration": 2.6057045459747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140677, + "balance_loss_mlp": 1.12448788, + "diversity_loss_mlp": 0.0, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.08763662153745684, + "language_loss": 0.82399738, + "learning_rate": 0.0009378107004294208, + "loss": 0.83540416, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 965, + "time_per_iteration": 2.9792187213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132665, + "balance_loss_mlp": 1.11624968, + "diversity_loss_mlp": 0.0, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.0696996408734829, + "language_loss": 0.91584361, + "learning_rate": 0.0009376601411095096, + "loss": 0.92717028, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.16418457, + "routerloss_mlp": 0.0, + "step": 966, + "time_per_iteration": 2.6557700634002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108368, + "balance_loss_mlp": 1.09209585, + "diversity_loss_mlp": 0.0, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.0928645758984953, + "language_loss": 0.86438054, + "learning_rate": 0.0009375094118748622, + "loss": 0.8754642, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.16271973, + "routerloss_mlp": 0.0, + "step": 967, + "time_per_iteration": 2.5574727058410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121341, + "balance_loss_mlp": 1.10546279, + "diversity_loss_mlp": 0.0, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.08866997131388626, + "language_loss": 0.90710455, + "learning_rate": 0.0009373585127839976, + "loss": 0.91831791, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 968, + "time_per_iteration": 2.9949731826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.1066587, + "diversity_loss_mlp": 0.0, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08663719992470821, + "language_loss": 0.90892541, + "learning_rate": 0.0009372074438954994, + "loss": 0.92014849, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.15637207, + "routerloss_mlp": 0.0, + "step": 969, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115205, + "balance_loss_mlp": 1.09983897, + "diversity_loss_mlp": 0.0, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.1288159292638968, + "language_loss": 0.91714692, + "learning_rate": 0.0009370562052680181, + "loss": 0.92829901, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.15356445, + "routerloss_mlp": 0.0, + "step": 970, + "time_per_iteration": 2.476053476333618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131477, + "balance_loss_mlp": 1.1160872, + "diversity_loss_mlp": 0.0, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.05501755081279848, + "language_loss": 0.89296091, + "learning_rate": 0.0009369047969602695, + "loss": 0.90427566, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.15368652, + "routerloss_mlp": 0.0, + "step": 971, + "time_per_iteration": 2.705310344696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161734, + "balance_loss_mlp": 1.14604628, + "diversity_loss_mlp": 0.0, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.09590230746039986, + "language_loss": 0.86690193, + "learning_rate": 0.0009367532190310357, + "loss": 0.8785193, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 972, + "time_per_iteration": 2.551683187484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151378, + "balance_loss_mlp": 1.13526106, + "diversity_loss_mlp": 0.0, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.13723256450586457, + "language_loss": 0.88859725, + "learning_rate": 0.0009366014715391644, + "loss": 0.90011096, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.16113281, + "routerloss_mlp": 0.0, + "step": 973, + "time_per_iteration": 2.6311707496643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140536, + "balance_loss_mlp": 1.12521768, + "diversity_loss_mlp": 0.0, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.0667022200872989, + "language_loss": 0.83902818, + "learning_rate": 0.0009364495545435693, + "loss": 0.85043353, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 974, + "time_per_iteration": 2.756056308746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121055, + "balance_loss_mlp": 1.10528326, + "diversity_loss_mlp": 0.0, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06720472395514528, + "language_loss": 0.88235438, + "learning_rate": 0.0009362974681032297, + "loss": 0.89356488, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 975, + "time_per_iteration": 2.601027488708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117499, + "balance_loss_mlp": 1.10179889, + "diversity_loss_mlp": 0.0, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.09372829562862567, + "language_loss": 0.88529336, + "learning_rate": 0.0009361452122771907, + "loss": 0.8964684, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 976, + "time_per_iteration": 2.8729074001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124468, + "balance_loss_mlp": 1.107934, + "diversity_loss_mlp": 0.0, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.10248565336705484, + "language_loss": 0.83506191, + "learning_rate": 0.0009359927871245635, + "loss": 0.84630656, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 977, + "time_per_iteration": 2.4633541107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114403, + "balance_loss_mlp": 1.12861657, + "diversity_loss_mlp": 0.0, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.09207140211488826, + "language_loss": 0.85937703, + "learning_rate": 0.0009358401927045246, + "loss": 0.87081736, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 978, + "time_per_iteration": 2.8528451919555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165656, + "balance_loss_mlp": 1.15002799, + "diversity_loss_mlp": 0.0, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.09819064259764942, + "language_loss": 0.88151729, + "learning_rate": 0.0009356874290763166, + "loss": 0.89317381, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 979, + "time_per_iteration": 3.4732589721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165217, + "balance_loss_mlp": 1.14985144, + "diversity_loss_mlp": 0.0, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.07125364842819645, + "language_loss": 0.88739443, + "learning_rate": 0.0009355344962992474, + "loss": 0.8990466, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 980, + "time_per_iteration": 2.618013381958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092711, + "balance_loss_mlp": 1.61735535, + "diversity_loss_mlp": 0.20325859, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.031158428526317693, + "language_loss": 0.8787328, + "learning_rate": 0.0009353813944326908, + "loss": 0.88800395, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0168031, + "step": 981, + "time_per_iteration": 2.926612377166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00925726, + "balance_loss_mlp": 1.616956, + "diversity_loss_mlp": 0.20126666, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.0354798675553145, + "language_loss": 0.82752389, + "learning_rate": 0.0009352281235360863, + "loss": 0.83678114, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01661466, + "step": 982, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156754, + "balance_loss_mlp": 1.14193642, + "diversity_loss_mlp": 0.0, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08008026175511872, + "language_loss": 0.84875655, + "learning_rate": 0.0009350746836689389, + "loss": 0.86032403, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 983, + "time_per_iteration": 2.5128703117370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232965, + "balance_loss_mlp": 1.22199774, + "diversity_loss_mlp": 0.0, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06420942239022731, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82672185, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 984, + "time_per_iteration": 4.987680196762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144146, + "balance_loss_mlp": 1.12880325, + "diversity_loss_mlp": 0.0, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08702988523082197, + "language_loss": 0.82654107, + "learning_rate": 0.0009347672972613634, + "loss": 0.83798254, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 985, + "time_per_iteration": 2.586580514907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891878, + "balance_loss_mlp": 1.54986262, + "diversity_loss_mlp": 0.20135348, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.032521151954013804, + "language_loss": 0.85226321, + "learning_rate": 0.0009346133508402735, + "loss": 0.86118197, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01626948, + "step": 986, + "time_per_iteration": 2.7389352321624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151414, + "balance_loss_mlp": 1.13596404, + "diversity_loss_mlp": 0.0, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.0982536864932062, + "language_loss": 0.84267235, + "learning_rate": 0.0009344592356873166, + "loss": 0.85418648, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 987, + "time_per_iteration": 2.6327145099639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157169, + "balance_loss_mlp": 1.14155281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.07528447862042392, + "language_loss": 0.78532755, + "learning_rate": 0.0009343049518623255, + "loss": 0.79689926, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 988, + "time_per_iteration": 2.7461259365081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161817, + "balance_loss_mlp": 1.14693928, + "diversity_loss_mlp": 0.0, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07061488940634471, + "language_loss": 0.83142781, + "learning_rate": 0.0009341504994251985, + "loss": 0.84304595, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 989, + "time_per_iteration": 2.9033045768737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128003, + "balance_loss_mlp": 1.11765516, + "diversity_loss_mlp": 0.0, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.02664126889468688, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74648499, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 990, + "time_per_iteration": 5.065544605255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116372, + "balance_loss_mlp": 1.14821064, + "diversity_loss_mlp": 0.0, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.062492069067547173, + "language_loss": 0.81668103, + "learning_rate": 0.0009338410889544574, + "loss": 0.82831824, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 991, + "time_per_iteration": 3.0360453128814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160077, + "balance_loss_mlp": 1.14444828, + "diversity_loss_mlp": 0.0, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07188646642614673, + "language_loss": 0.87598348, + "learning_rate": 0.000933686131040967, + "loss": 0.88758421, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 992, + "time_per_iteration": 4.194309234619141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132508, + "balance_loss_mlp": 1.11693931, + "diversity_loss_mlp": 0.0, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.07096950165415856, + "language_loss": 0.90250611, + "learning_rate": 0.0009335310047555883, + "loss": 0.91383117, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 993, + "time_per_iteration": 2.7198565006256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128004, + "balance_loss_mlp": 1.11225605, + "diversity_loss_mlp": 0.0, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.07682750770192658, + "language_loss": 0.8836562, + "learning_rate": 0.0009333757101585467, + "loss": 0.89493626, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 994, + "time_per_iteration": 2.6651480197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121887, + "balance_loss_mlp": 1.10621142, + "diversity_loss_mlp": 0.0, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.10461680978710068, + "language_loss": 0.9317944, + "learning_rate": 0.0009332202473101329, + "loss": 0.94301325, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 995, + "time_per_iteration": 2.667943239212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00890685, + "balance_loss_mlp": 1.54595685, + "diversity_loss_mlp": 0.2013846, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.03439253799161941, + "language_loss": 0.8270663, + "learning_rate": 0.0009330646162707028, + "loss": 0.83597314, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170145, + "step": 996, + "time_per_iteration": 2.7859413623809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130524, + "balance_loss_mlp": 1.11483645, + "diversity_loss_mlp": 0.0, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.07379991060729872, + "language_loss": 0.84002179, + "learning_rate": 0.0009329088171006779, + "loss": 0.85132706, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.15673828, + "routerloss_mlp": 0.0, + "step": 997, + "time_per_iteration": 3.133023738861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136353, + "balance_loss_mlp": 1.12061739, + "diversity_loss_mlp": 0.0, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09187105070084006, + "language_loss": 0.85599297, + "learning_rate": 0.0009327528498605446, + "loss": 0.86735654, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 998, + "time_per_iteration": 2.5390877723693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888942, + "balance_loss_mlp": 1.54108667, + "diversity_loss_mlp": 0.20404731, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.03685920036749298, + "language_loss": 0.89166534, + "learning_rate": 0.0009325967146108548, + "loss": 0.90055484, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01637482, + "step": 999, + "time_per_iteration": 2.7167420387268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159789, + "balance_loss_mlp": 1.14361215, + "diversity_loss_mlp": 0.0, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.08415694153473897, + "language_loss": 0.87386107, + "learning_rate": 0.0009324404114122258, + "loss": 0.88545901, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.16174316, + "routerloss_mlp": 0.0, + "step": 1000, + "time_per_iteration": 2.6833291053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164843, + "balance_loss_mlp": 1.1492269, + "diversity_loss_mlp": 0.0, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.07516183221332183, + "language_loss": 0.86446774, + "learning_rate": 0.0009322839403253397, + "loss": 0.87611622, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.15612793, + "routerloss_mlp": 0.0, + "step": 1001, + "time_per_iteration": 4.16480565071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173642, + "balance_loss_mlp": 1.15789402, + "diversity_loss_mlp": 0.0, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.07739515949456567, + "language_loss": 0.84035075, + "learning_rate": 0.0009321273014109439, + "loss": 0.8520872, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1002, + "time_per_iteration": 2.9390604496002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183539, + "balance_loss_mlp": 1.16795826, + "diversity_loss_mlp": 0.0, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.08102605487142737, + "language_loss": 0.84643984, + "learning_rate": 0.0009319704947298513, + "loss": 0.85827518, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1003, + "time_per_iteration": 2.923952579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116012, + "balance_loss_mlp": 1.14496815, + "diversity_loss_mlp": 0.0, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.060771133612280225, + "language_loss": 0.88448775, + "learning_rate": 0.0009318135203429393, + "loss": 0.89608896, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.15124512, + "routerloss_mlp": 0.0, + "step": 1004, + "time_per_iteration": 2.7170984745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135222, + "balance_loss_mlp": 1.11972475, + "diversity_loss_mlp": 0.0, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.07023398647530335, + "language_loss": 0.87528408, + "learning_rate": 0.0009316563783111511, + "loss": 0.88663626, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1005, + "time_per_iteration": 2.7271320819854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011162, + "balance_loss_mlp": 1.10061884, + "diversity_loss_mlp": 0.0, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.07388032809600253, + "language_loss": 0.82009041, + "learning_rate": 0.0009314990686954943, + "loss": 0.83125246, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1006, + "time_per_iteration": 2.9210305213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108745, + "balance_loss_mlp": 1.09337938, + "diversity_loss_mlp": 0.0, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.06330578200459082, + "language_loss": 0.80805916, + "learning_rate": 0.000931341591557042, + "loss": 0.81914663, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.15344238, + "routerloss_mlp": 0.0, + "step": 1007, + "time_per_iteration": 3.695157051086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095993, + "balance_loss_mlp": 1.08054364, + "diversity_loss_mlp": 0.0, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.07858263731415134, + "language_loss": 0.87216473, + "learning_rate": 0.0009311839469569325, + "loss": 0.88312465, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1008, + "time_per_iteration": 2.633854389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108854, + "balance_loss_mlp": 1.07287586, + "diversity_loss_mlp": 0.0, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.14235975733457876, + "language_loss": 0.87399781, + "learning_rate": 0.0009310261349563687, + "loss": 0.88488322, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1009, + "time_per_iteration": 2.702073574066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898627, + "balance_loss_mlp": 1.56164169, + "diversity_loss_mlp": 0.20371187, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.03011805945399338, + "language_loss": 0.85438645, + "learning_rate": 0.0009308681556166186, + "loss": 0.86337274, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01594995, + "step": 1010, + "time_per_iteration": 2.8698601722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111744, + "balance_loss_mlp": 1.0962348, + "diversity_loss_mlp": 0.0, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.08879322612819535, + "language_loss": 0.87462533, + "learning_rate": 0.0009307100089990152, + "loss": 0.88574278, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1011, + "time_per_iteration": 2.7149901390075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140864, + "balance_loss_mlp": 1.12543821, + "diversity_loss_mlp": 0.0, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.07383907155719892, + "language_loss": 0.83837229, + "learning_rate": 0.0009305516951649568, + "loss": 0.84978092, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.15405273, + "routerloss_mlp": 0.0, + "step": 1012, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161407, + "balance_loss_mlp": 1.14599323, + "diversity_loss_mlp": 0.0, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07624018834593461, + "language_loss": 0.86570859, + "learning_rate": 0.0009303932141759057, + "loss": 0.87732267, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.15393066, + "routerloss_mlp": 0.0, + "step": 1013, + "time_per_iteration": 2.7500197887420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168747, + "balance_loss_mlp": 1.15382242, + "diversity_loss_mlp": 0.0, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.08469076174706892, + "language_loss": 0.83575755, + "learning_rate": 0.0009302345660933902, + "loss": 0.84744501, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1014, + "time_per_iteration": 2.8010780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171185, + "balance_loss_mlp": 1.15642715, + "diversity_loss_mlp": 0.0, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.08619273283705803, + "language_loss": 0.85146868, + "learning_rate": 0.0009300757509790026, + "loss": 0.86318052, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1015, + "time_per_iteration": 2.840315103530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150596, + "balance_loss_mlp": 1.13570654, + "diversity_loss_mlp": 0.0, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.10655365126946059, + "language_loss": 0.90244913, + "learning_rate": 0.0009299167688944005, + "loss": 0.91395509, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1016, + "time_per_iteration": 2.502391815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130549, + "balance_loss_mlp": 1.11540985, + "diversity_loss_mlp": 0.0, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.07757202619564983, + "language_loss": 0.85754222, + "learning_rate": 0.0009297576199013063, + "loss": 0.86884773, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1017, + "time_per_iteration": 2.7255496978759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00657481, + "balance_loss_mlp": 1.1064117, + "diversity_loss_mlp": 0.17609364, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.0027779106975556575, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.73659611, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01622855, + "step": 1018, + "time_per_iteration": 4.943171739578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384914, + "balance_loss_mlp": 1.37351775, + "diversity_loss_mlp": 0.0, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.09054623740471555, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80811214, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 1019, + "time_per_iteration": 5.518418788909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125322, + "balance_loss_mlp": 1.11074281, + "diversity_loss_mlp": 0.0, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.08202201534603108, + "language_loss": 0.8648417, + "learning_rate": 0.0009292791720892659, + "loss": 0.87609494, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1020, + "time_per_iteration": 2.889078140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131715, + "balance_loss_mlp": 1.11721921, + "diversity_loss_mlp": 0.0, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.07932574612707302, + "language_loss": 0.88913518, + "learning_rate": 0.0009291193560807218, + "loss": 0.90045238, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1021, + "time_per_iteration": 2.5933609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136679, + "balance_loss_mlp": 1.122159, + "diversity_loss_mlp": 0.0, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.08278255048112054, + "language_loss": 0.87034905, + "learning_rate": 0.0009289593734732688, + "loss": 0.88171583, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1022, + "time_per_iteration": 2.600834369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132774, + "balance_loss_mlp": 1.11842132, + "diversity_loss_mlp": 0.0, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.08270608551386573, + "language_loss": 0.93774927, + "learning_rate": 0.0009287992243290175, + "loss": 0.94907701, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1023, + "time_per_iteration": 2.474914312362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111783, + "balance_loss_mlp": 1.10275006, + "diversity_loss_mlp": 0.0, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.06901830196983176, + "language_loss": 0.90473127, + "learning_rate": 0.0009286389087101435, + "loss": 0.91590953, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1024, + "time_per_iteration": 2.7718465328216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120328, + "balance_loss_mlp": 1.1055932, + "diversity_loss_mlp": 0.0, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.07476522676232629, + "language_loss": 0.8853035, + "learning_rate": 0.0009284784266788864, + "loss": 0.89650679, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1025, + "time_per_iteration": 2.7143290042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122071, + "balance_loss_mlp": 1.10795665, + "diversity_loss_mlp": 0.0, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.08990804702262417, + "language_loss": 0.91984832, + "learning_rate": 0.0009283177782975512, + "loss": 0.93106908, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1026, + "time_per_iteration": 2.948909282684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115739, + "balance_loss_mlp": 1.10118401, + "diversity_loss_mlp": 0.0, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.08229992096701991, + "language_loss": 0.88074464, + "learning_rate": 0.000928156963628507, + "loss": 0.89190209, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1027, + "time_per_iteration": 2.5764074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109846, + "balance_loss_mlp": 1.09483802, + "diversity_loss_mlp": 0.0, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.08379460495492784, + "language_loss": 0.87978798, + "learning_rate": 0.0009279959827341877, + "loss": 0.89088643, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1028, + "time_per_iteration": 2.752347946166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08043635, + "diversity_loss_mlp": 0.0, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.08467225305095022, + "language_loss": 0.87624389, + "learning_rate": 0.0009278348356770915, + "loss": 0.88720024, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1029, + "time_per_iteration": 2.555527687072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096968, + "balance_loss_mlp": 1.08132768, + "diversity_loss_mlp": 0.0, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.0755245964113765, + "language_loss": 0.85285002, + "learning_rate": 0.0009276735225197814, + "loss": 0.86381966, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1030, + "time_per_iteration": 2.5947089195251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104122, + "balance_loss_mlp": 1.08832633, + "diversity_loss_mlp": 0.0, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.08972056860523267, + "language_loss": 0.85732102, + "learning_rate": 0.0009275120433248847, + "loss": 0.86836231, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.15783691, + "routerloss_mlp": 0.0, + "step": 1031, + "time_per_iteration": 2.676872730255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109193, + "balance_loss_mlp": 1.09355247, + "diversity_loss_mlp": 0.0, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.07488561277584621, + "language_loss": 0.85529125, + "learning_rate": 0.0009273503981550931, + "loss": 0.86638314, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.15625, + "routerloss_mlp": 0.0, + "step": 1032, + "time_per_iteration": 3.09958815574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099668, + "balance_loss_mlp": 1.08494592, + "diversity_loss_mlp": 0.0, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.1040963884260124, + "language_loss": 0.86882496, + "learning_rate": 0.0009271885870731626, + "loss": 0.87982166, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1033, + "time_per_iteration": 2.509047269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098009, + "balance_loss_mlp": 1.08258307, + "diversity_loss_mlp": 0.0, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.09324111295027285, + "language_loss": 0.88376671, + "learning_rate": 0.0009270266101419143, + "loss": 0.89474678, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1034, + "time_per_iteration": 2.6504034996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07954955, + "diversity_loss_mlp": 0.0, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.12545708784893086, + "language_loss": 0.85201651, + "learning_rate": 0.0009268644674242328, + "loss": 0.86295891, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1035, + "time_per_iteration": 2.6919047832489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105423, + "balance_loss_mlp": 1.08997381, + "diversity_loss_mlp": 0.0, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09055239952020887, + "language_loss": 0.80814689, + "learning_rate": 0.0009267021589830678, + "loss": 0.81920111, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1036, + "time_per_iteration": 2.582871198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278291, + "balance_loss_mlp": 1.26927888, + "diversity_loss_mlp": 0.0, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.10087907784966592, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78905374, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 1037, + "time_per_iteration": 4.955699920654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112922, + "balance_loss_mlp": 1.11371088, + "diversity_loss_mlp": 0.0, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08737337363848705, + "language_loss": 0.9264009, + "learning_rate": 0.000926377045182406, + "loss": 0.93769312, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.15490723, + "routerloss_mlp": 0.0, + "step": 1038, + "time_per_iteration": 2.8884389400482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140143, + "balance_loss_mlp": 1.12453878, + "diversity_loss_mlp": 0.0, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.10415849564176528, + "language_loss": 0.87916917, + "learning_rate": 0.0009262142399491296, + "loss": 0.89057058, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1039, + "time_per_iteration": 3.045872211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143419, + "balance_loss_mlp": 1.12763548, + "diversity_loss_mlp": 0.0, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09906225236156592, + "language_loss": 0.87455821, + "learning_rate": 0.0009260512692448105, + "loss": 0.88599241, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.15771484, + "routerloss_mlp": 0.0, + "step": 1040, + "time_per_iteration": 2.699052572250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.10879421, + "diversity_loss_mlp": 0.0, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.0911420547130344, + "language_loss": 0.8431657, + "learning_rate": 0.000925888133132719, + "loss": 0.85441184, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1041, + "time_per_iteration": 2.780141830444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_mlp": 1.05260694, + "diversity_loss_mlp": 0.0, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.04139604987307943, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80673575, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 1042, + "time_per_iteration": 4.971017360687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100746, + "balance_loss_mlp": 1.08498645, + "diversity_loss_mlp": 0.0, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.08950731646766712, + "language_loss": 0.81070006, + "learning_rate": 0.0009255613649386244, + "loss": 0.82170749, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.1574707, + "routerloss_mlp": 0.0, + "step": 1043, + "time_per_iteration": 2.6508612632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091355, + "balance_loss_mlp": 1.07623935, + "diversity_loss_mlp": 0.0, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.07614483401418765, + "language_loss": 0.78829026, + "learning_rate": 0.0009253977329834838, + "loss": 0.79920387, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1044, + "time_per_iteration": 2.7090582847595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109464, + "balance_loss_mlp": 1.07947624, + "diversity_loss_mlp": 0.0, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.0989854096864982, + "language_loss": 0.86366481, + "learning_rate": 0.0009252339358742965, + "loss": 0.8746112, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1045, + "time_per_iteration": 2.801323652267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100349, + "balance_loss_mlp": 1.08526874, + "diversity_loss_mlp": 0.0, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.07994799859902735, + "language_loss": 0.83704323, + "learning_rate": 0.000925069973674654, + "loss": 0.84804672, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.15063477, + "routerloss_mlp": 0.0, + "step": 1046, + "time_per_iteration": 2.6286635398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011046, + "balance_loss_mlp": 1.09036636, + "diversity_loss_mlp": 0.0, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.05803081938267982, + "language_loss": 0.88841283, + "learning_rate": 0.000924905846448212, + "loss": 0.89945889, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1047, + "time_per_iteration": 2.7208023071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135029, + "balance_loss_mlp": 1.12078381, + "diversity_loss_mlp": 0.0, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09159511175118457, + "language_loss": 0.85692465, + "learning_rate": 0.0009247415542586906, + "loss": 0.86827493, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1048, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089504, + "balance_loss_mlp": 1.55797935, + "diversity_loss_mlp": 0.19993141, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.028193920194447036, + "language_loss": 0.83094788, + "learning_rate": 0.0009245770971698735, + "loss": 0.83989829, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01608507, + "step": 1049, + "time_per_iteration": 2.922792911529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143099, + "balance_loss_mlp": 1.12878203, + "diversity_loss_mlp": 0.0, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08345797467079887, + "language_loss": 0.88434327, + "learning_rate": 0.0009244124752456087, + "loss": 0.89577425, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1050, + "time_per_iteration": 2.5263967514038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141188, + "balance_loss_mlp": 1.12675214, + "diversity_loss_mlp": 0.0, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07479960387863874, + "language_loss": 0.85303241, + "learning_rate": 0.0009242476885498081, + "loss": 0.86444432, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1051, + "time_per_iteration": 2.8012773990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146474, + "balance_loss_mlp": 1.13181126, + "diversity_loss_mlp": 0.0, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.07632391919964465, + "language_loss": 0.81114984, + "learning_rate": 0.0009240827371464474, + "loss": 0.82261455, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1052, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146729, + "balance_loss_mlp": 1.1323998, + "diversity_loss_mlp": 0.0, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.11219768477147798, + "language_loss": 0.84167284, + "learning_rate": 0.0009239176210995666, + "loss": 0.85314012, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1053, + "time_per_iteration": 3.4905290603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153158, + "balance_loss_mlp": 1.13878179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.07345468089138417, + "language_loss": 0.93850195, + "learning_rate": 0.0009237523404732695, + "loss": 0.95003355, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1054, + "time_per_iteration": 2.8854215145111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116831, + "balance_loss_mlp": 1.15374279, + "diversity_loss_mlp": 0.0, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08788286689344726, + "language_loss": 0.84136868, + "learning_rate": 0.0009235868953317235, + "loss": 0.85305184, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1055, + "time_per_iteration": 2.785616397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115453, + "balance_loss_mlp": 1.14033246, + "diversity_loss_mlp": 0.0, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.07006303181868268, + "language_loss": 0.85314858, + "learning_rate": 0.0009234212857391602, + "loss": 0.86469388, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1056, + "time_per_iteration": 3.192293167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167757, + "balance_loss_mlp": 1.15304708, + "diversity_loss_mlp": 0.0, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.07469852363602907, + "language_loss": 0.89220309, + "learning_rate": 0.000923255511759875, + "loss": 0.9038806, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1057, + "time_per_iteration": 2.783778429031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881428, + "balance_loss_mlp": 1.53356147, + "diversity_loss_mlp": 0.1968638, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.032510948660132113, + "language_loss": 0.84587663, + "learning_rate": 0.000923089573458227, + "loss": 0.85469091, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621579, + "step": 1058, + "time_per_iteration": 2.8847100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150746, + "balance_loss_mlp": 1.13623881, + "diversity_loss_mlp": 0.0, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.11181454207252314, + "language_loss": 0.83516467, + "learning_rate": 0.0009229234708986392, + "loss": 0.84667218, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1059, + "time_per_iteration": 2.9079415798187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172867, + "balance_loss_mlp": 1.16251993, + "diversity_loss_mlp": 0.0, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.06024273804144221, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82839763, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1060, + "time_per_iteration": 4.646218776702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112152, + "balance_loss_mlp": 1.10713172, + "diversity_loss_mlp": 0.0, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.08928557521337042, + "language_loss": 0.85345757, + "learning_rate": 0.0009225907732636548, + "loss": 0.86467278, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1061, + "time_per_iteration": 2.745448112487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106249, + "balance_loss_mlp": 1.09209883, + "diversity_loss_mlp": 0.0, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.079028173596336, + "language_loss": 0.86936563, + "learning_rate": 0.0009224241783174227, + "loss": 0.88042819, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1062, + "time_per_iteration": 2.6923935413360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090506, + "balance_loss_mlp": 1.07616472, + "diversity_loss_mlp": 0.0, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.07452632641130948, + "language_loss": 0.85384166, + "learning_rate": 0.0009222574193715802, + "loss": 0.86474669, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1063, + "time_per_iteration": 2.7701327800750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07850981, + "diversity_loss_mlp": 0.0, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.06517233034985846, + "language_loss": 0.85915947, + "learning_rate": 0.000922090496490869, + "loss": 0.87008905, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1064, + "time_per_iteration": 2.7387099266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098934, + "balance_loss_mlp": 1.08404493, + "diversity_loss_mlp": 0.0, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.06963355430403552, + "language_loss": 0.89889115, + "learning_rate": 0.0009219234097400937, + "loss": 0.90988052, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1065, + "time_per_iteration": 2.859334707260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112884, + "balance_loss_mlp": 1.09778059, + "diversity_loss_mlp": 0.0, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.06723697540994414, + "language_loss": 0.83086514, + "learning_rate": 0.0009217561591841237, + "loss": 0.84199405, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1066, + "time_per_iteration": 3.3065547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00886484, + "balance_loss_mlp": 1.54046464, + "diversity_loss_mlp": 0.1982768, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.03984406199709606, + "language_loss": 0.80820358, + "learning_rate": 0.0009215887448878913, + "loss": 0.8170684, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01711285, + "step": 1067, + "time_per_iteration": 2.6291754245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131678, + "balance_loss_mlp": 1.11697936, + "diversity_loss_mlp": 0.0, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.07633348035576148, + "language_loss": 0.85365784, + "learning_rate": 0.0009214211669163922, + "loss": 0.86497462, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1068, + "time_per_iteration": 2.747936725616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136914, + "balance_loss_mlp": 1.12220347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.07197705825645119, + "language_loss": 0.9405331, + "learning_rate": 0.0009212534253346862, + "loss": 0.95190227, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1069, + "time_per_iteration": 2.696131467819214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128507, + "balance_loss_mlp": 1.11372542, + "diversity_loss_mlp": 0.0, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09743186487320747, + "language_loss": 0.84269625, + "learning_rate": 0.0009210855202078964, + "loss": 0.85398132, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1070, + "time_per_iteration": 2.6194372177124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114316, + "balance_loss_mlp": 1.12903321, + "diversity_loss_mlp": 0.0, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08033414700046611, + "language_loss": 0.87081122, + "learning_rate": 0.0009209174516012091, + "loss": 0.88224292, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1071, + "time_per_iteration": 2.5169904232025146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146914, + "balance_loss_mlp": 1.13247752, + "diversity_loss_mlp": 0.0, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.06769648970134874, + "language_loss": 0.89207751, + "learning_rate": 0.0009207492195798747, + "loss": 0.90354669, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1072, + "time_per_iteration": 2.804577112197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.12303698, + "diversity_loss_mlp": 0.0, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.0857236005827703, + "language_loss": 0.84780991, + "learning_rate": 0.0009205808242092061, + "loss": 0.85918474, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1073, + "time_per_iteration": 2.6134936809539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122455, + "balance_loss_mlp": 1.10787559, + "diversity_loss_mlp": 0.0, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.09531084522047072, + "language_loss": 0.82512677, + "learning_rate": 0.0009204122655545808, + "loss": 0.83635134, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1074, + "time_per_iteration": 3.461315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00888955, + "balance_loss_mlp": 1.54418314, + "diversity_loss_mlp": 0.20175909, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.03221822204199988, + "language_loss": 0.80952764, + "learning_rate": 0.0009202435436814388, + "loss": 0.81841719, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01598355, + "step": 1075, + "time_per_iteration": 2.728055238723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146745, + "balance_loss_mlp": 1.13259482, + "diversity_loss_mlp": 0.0, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.0831097658087499, + "language_loss": 0.89925295, + "learning_rate": 0.0009200746586552836, + "loss": 0.91072041, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1076, + "time_per_iteration": 2.929422616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136182, + "balance_loss_mlp": 1.12185347, + "diversity_loss_mlp": 0.0, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.07960863169785164, + "language_loss": 0.84148425, + "learning_rate": 0.0009199056105416825, + "loss": 0.85284609, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1077, + "time_per_iteration": 3.0795576572418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148051, + "balance_loss_mlp": 1.13384151, + "diversity_loss_mlp": 0.0, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.06589509494701294, + "language_loss": 0.86599898, + "learning_rate": 0.0009197363994062654, + "loss": 0.87747955, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1078, + "time_per_iteration": 2.8304550647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00891417, + "balance_loss_mlp": 1.54815006, + "diversity_loss_mlp": 0.20151556, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.027729032115243194, + "language_loss": 0.84302026, + "learning_rate": 0.0009195670253147262, + "loss": 0.85193443, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01658459, + "step": 1079, + "time_per_iteration": 2.987715005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168872, + "balance_loss_mlp": 1.15472198, + "diversity_loss_mlp": 0.0, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07878432741989363, + "language_loss": 0.82508785, + "learning_rate": 0.0009193974883328216, + "loss": 0.83677661, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1080, + "time_per_iteration": 2.6007754802703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178335, + "balance_loss_mlp": 1.16408908, + "diversity_loss_mlp": 0.0, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.06872318796781544, + "language_loss": 0.86871535, + "learning_rate": 0.0009192277885263718, + "loss": 0.88049871, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1081, + "time_per_iteration": 2.645918846130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116777, + "balance_loss_mlp": 1.15339386, + "diversity_loss_mlp": 0.0, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.08475435362049728, + "language_loss": 0.86010319, + "learning_rate": 0.0009190579259612602, + "loss": 0.87178093, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1082, + "time_per_iteration": 3.2688331604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153529, + "balance_loss_mlp": 1.13914001, + "diversity_loss_mlp": 0.0, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.06676527060715894, + "language_loss": 0.86419082, + "learning_rate": 0.000918887900703433, + "loss": 0.8757261, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1083, + "time_per_iteration": 2.7645068168640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129996, + "balance_loss_mlp": 1.11559522, + "diversity_loss_mlp": 0.0, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.07296749014166971, + "language_loss": 0.89779425, + "learning_rate": 0.0009187177128188999, + "loss": 0.90909421, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1084, + "time_per_iteration": 2.441312313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_mlp": 1.11915255, + "diversity_loss_mlp": 0.0, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.053207927956046876, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78285372, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1085, + "time_per_iteration": 4.864179849624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117368, + "balance_loss_mlp": 1.1029439, + "diversity_loss_mlp": 0.0, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.07905606819783856, + "language_loss": 0.85833263, + "learning_rate": 0.000918376849434071, + "loss": 0.86950636, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1086, + "time_per_iteration": 4.049270868301392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112999, + "balance_loss_mlp": 1.09849179, + "diversity_loss_mlp": 0.0, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.08954509639668791, + "language_loss": 0.90778226, + "learning_rate": 0.0009182061740661098, + "loss": 0.91891223, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1087, + "time_per_iteration": 2.557358741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128974, + "balance_loss_mlp": 1.11446643, + "diversity_loss_mlp": 0.0, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08446380837501397, + "language_loss": 0.85054636, + "learning_rate": 0.0009180353363361127, + "loss": 0.86183608, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1088, + "time_per_iteration": 3.0897305011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118573, + "balance_loss_mlp": 1.10417306, + "diversity_loss_mlp": 0.0, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.08173869768976531, + "language_loss": 0.82508695, + "learning_rate": 0.0009178643363104044, + "loss": 0.83627272, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1089, + "time_per_iteration": 3.124645948410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113657, + "balance_loss_mlp": 1.09938824, + "diversity_loss_mlp": 0.0, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.09307233053408402, + "language_loss": 0.90518665, + "learning_rate": 0.0009176931740553735, + "loss": 0.9163233, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1090, + "time_per_iteration": 2.6098225116729736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.09981966, + "diversity_loss_mlp": 0.0, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.09489388322063774, + "language_loss": 0.8240813, + "learning_rate": 0.0009175218496374708, + "loss": 0.83521861, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1091, + "time_per_iteration": 3.336355686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110612, + "balance_loss_mlp": 1.09205294, + "diversity_loss_mlp": 0.0, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.08870561470384966, + "language_loss": 0.86057436, + "learning_rate": 0.0009173503631232103, + "loss": 0.87163556, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1092, + "time_per_iteration": 3.356015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106884, + "balance_loss_mlp": 1.09269798, + "diversity_loss_mlp": 0.0, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09478788106803046, + "language_loss": 0.82067865, + "learning_rate": 0.0009171787145791691, + "loss": 0.83174753, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1093, + "time_per_iteration": 3.2546143531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116222, + "balance_loss_mlp": 1.10199988, + "diversity_loss_mlp": 0.0, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14674509624116924, + "language_loss": 0.80160701, + "learning_rate": 0.000917006904071987, + "loss": 0.81276917, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1094, + "time_per_iteration": 2.5837080478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911953, + "balance_loss_mlp": 1.58726883, + "diversity_loss_mlp": 0.20477253, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.035943125208157026, + "language_loss": 0.8737694, + "learning_rate": 0.0009168349316683669, + "loss": 0.88288891, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01593196, + "step": 1095, + "time_per_iteration": 2.768296718597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136825, + "balance_loss_mlp": 1.1224122, + "diversity_loss_mlp": 0.0, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06639171103878667, + "language_loss": 0.82719827, + "learning_rate": 0.0009166627974350741, + "loss": 0.83856648, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1096, + "time_per_iteration": 2.8819992542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145046, + "balance_loss_mlp": 1.13041949, + "diversity_loss_mlp": 0.0, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.08337696606413014, + "language_loss": 0.89929205, + "learning_rate": 0.0009164905014389373, + "loss": 0.91074252, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1097, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163813, + "balance_loss_mlp": 1.1495918, + "diversity_loss_mlp": 0.0, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08033808486911229, + "language_loss": 0.86386079, + "learning_rate": 0.0009163180437468476, + "loss": 0.87549889, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1098, + "time_per_iteration": 2.6314592361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176615, + "balance_loss_mlp": 1.16195273, + "diversity_loss_mlp": 0.0, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.09094665560265827, + "language_loss": 0.85629344, + "learning_rate": 0.000916145424425759, + "loss": 0.86805964, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1099, + "time_per_iteration": 2.6608541011810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181873, + "balance_loss_mlp": 1.16744852, + "diversity_loss_mlp": 0.0, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.09944182260515583, + "language_loss": 0.9083795, + "learning_rate": 0.0009159726435426885, + "loss": 0.9201982, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1100, + "time_per_iteration": 3.0502405166625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149234, + "balance_loss_mlp": 1.134619, + "diversity_loss_mlp": 0.0, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.09151162791452093, + "language_loss": 0.90900993, + "learning_rate": 0.0009157997011647154, + "loss": 0.92050231, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1101, + "time_per_iteration": 2.6048476696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127613, + "balance_loss_mlp": 1.11389172, + "diversity_loss_mlp": 0.0, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.07696729699318336, + "language_loss": 0.86130077, + "learning_rate": 0.0009156265973589817, + "loss": 0.87257689, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1102, + "time_per_iteration": 2.7552144527435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114805, + "balance_loss_mlp": 1.10088181, + "diversity_loss_mlp": 0.0, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.07661877314329607, + "language_loss": 0.89485067, + "learning_rate": 0.0009154533321926926, + "loss": 0.90599877, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.13909912, + "routerloss_mlp": 0.0, + "step": 1103, + "time_per_iteration": 4.073851108551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105254, + "balance_loss_mlp": 1.09134197, + "diversity_loss_mlp": 0.0, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08363594534482698, + "language_loss": 0.8717171, + "learning_rate": 0.0009152799057331156, + "loss": 0.88276958, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1104, + "time_per_iteration": 3.142221450805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100132, + "balance_loss_mlp": 1.08656633, + "diversity_loss_mlp": 0.0, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1056362594360365, + "language_loss": 0.91270363, + "learning_rate": 0.0009151063180475805, + "loss": 0.92370498, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1105, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095772, + "balance_loss_mlp": 1.08196795, + "diversity_loss_mlp": 0.0, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.08072473316090223, + "language_loss": 0.84285367, + "learning_rate": 0.0009149325692034803, + "loss": 0.85381138, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1106, + "time_per_iteration": 2.5711469650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071395, + "balance_loss_mlp": 1.06266928, + "diversity_loss_mlp": 0.0, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.04229613635199888, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.8027482, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1107, + "time_per_iteration": 4.817704916000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129097, + "balance_loss_mlp": 1.11547112, + "diversity_loss_mlp": 0.0, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.07382538641756346, + "language_loss": 0.8748607, + "learning_rate": 0.0009145845883094678, + "loss": 0.88615161, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1108, + "time_per_iteration": 3.039318561553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150208, + "balance_loss_mlp": 1.13671303, + "diversity_loss_mlp": 0.0, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.07887220377556703, + "language_loss": 0.85174125, + "learning_rate": 0.000914410356394654, + "loss": 0.86324334, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1109, + "time_per_iteration": 2.76413893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116209, + "balance_loss_mlp": 1.1484766, + "diversity_loss_mlp": 0.0, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.06362602917472766, + "language_loss": 0.84447891, + "learning_rate": 0.0009142359635914709, + "loss": 0.85609984, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1110, + "time_per_iteration": 3.007201671600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163563, + "balance_loss_mlp": 1.15004468, + "diversity_loss_mlp": 0.0, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07633144605420673, + "language_loss": 0.84598219, + "learning_rate": 0.0009140614099676245, + "loss": 0.85761786, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1111, + "time_per_iteration": 2.569401979446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161722, + "balance_loss_mlp": 1.14807272, + "diversity_loss_mlp": 0.0, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.0712977258009472, + "language_loss": 0.82590818, + "learning_rate": 0.0009138866955908821, + "loss": 0.83752549, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1112, + "time_per_iteration": 2.870701789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166663, + "balance_loss_mlp": 1.15294182, + "diversity_loss_mlp": 0.0, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.09239605609063735, + "language_loss": 0.80485952, + "learning_rate": 0.0009137118205290738, + "loss": 0.81652606, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.13739014, + "routerloss_mlp": 0.0, + "step": 1113, + "time_per_iteration": 2.9623591899871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174843, + "balance_loss_mlp": 1.16082442, + "diversity_loss_mlp": 0.0, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08763873550503462, + "language_loss": 0.90553653, + "learning_rate": 0.0009135367848500924, + "loss": 0.91728497, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1114, + "time_per_iteration": 2.5287492275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165146, + "balance_loss_mlp": 1.15138936, + "diversity_loss_mlp": 0.0, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.11593363319598911, + "language_loss": 0.86361086, + "learning_rate": 0.0009133615886218927, + "loss": 0.87526232, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1115, + "time_per_iteration": 2.6945505142211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141616, + "balance_loss_mlp": 1.12725139, + "diversity_loss_mlp": 0.0, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08371979294567897, + "language_loss": 0.87389791, + "learning_rate": 0.0009131862319124917, + "loss": 0.88531411, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1116, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130003, + "balance_loss_mlp": 1.1162107, + "diversity_loss_mlp": 0.0, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.08272793517794225, + "language_loss": 0.83981287, + "learning_rate": 0.0009130107147899691, + "loss": 0.85111284, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1117, + "time_per_iteration": 2.698151111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.1039083, + "diversity_loss_mlp": 0.0, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.4685945915436946, + "language_loss": 0.85086691, + "learning_rate": 0.0009128350373224665, + "loss": 0.86204791, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1118, + "time_per_iteration": 2.545565128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059182, + "balance_loss_mlp": 1.04950213, + "diversity_loss_mlp": 0.0, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.03761711697708654, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82515609, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 1119, + "time_per_iteration": 4.648902416229248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118843, + "balance_loss_mlp": 1.10412121, + "diversity_loss_mlp": 0.0, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07492511871579786, + "language_loss": 0.85205054, + "learning_rate": 0.0009124832016254005, + "loss": 0.86323893, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1120, + "time_per_iteration": 2.5875513553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112958, + "balance_loss_mlp": 1.11404657, + "diversity_loss_mlp": 0.0, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.10623123993924175, + "language_loss": 0.88117284, + "learning_rate": 0.0009123070435324316, + "loss": 0.89246857, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1121, + "time_per_iteration": 2.752814769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119114, + "balance_loss_mlp": 1.10852826, + "diversity_loss_mlp": 0.0, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.05861429426141409, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78994894, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 1122, + "time_per_iteration": 4.993450880050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114894, + "balance_loss_mlp": 1.13229823, + "diversity_loss_mlp": 0.0, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.09758120262844092, + "language_loss": 0.86477894, + "learning_rate": 0.0009119542471995752, + "loss": 0.87626839, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.16650391, + "routerloss_mlp": 0.0, + "step": 1123, + "time_per_iteration": 2.8260560035705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132116, + "balance_loss_mlp": 1.1160109, + "diversity_loss_mlp": 0.0, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.1175490331770948, + "language_loss": 0.81597894, + "learning_rate": 0.0009117776090966554, + "loss": 0.82730007, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1124, + "time_per_iteration": 2.955768585205078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133281, + "balance_loss_mlp": 1.1166153, + "diversity_loss_mlp": 0.0, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08908783615486303, + "language_loss": 0.86717665, + "learning_rate": 0.0009116008111274899, + "loss": 0.87850952, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1125, + "time_per_iteration": 3.2493131160736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_mlp": 1.02921367, + "diversity_loss_mlp": 0.0, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.03267712428803131, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80145574, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 1126, + "time_per_iteration": 4.8121678829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148434, + "balance_loss_mlp": 1.13257909, + "diversity_loss_mlp": 0.0, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.09699177011816186, + "language_loss": 0.85244691, + "learning_rate": 0.0009112467358650396, + "loss": 0.86393118, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1127, + "time_per_iteration": 3.144075393676758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166528, + "balance_loss_mlp": 1.15056634, + "diversity_loss_mlp": 0.0, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.07985175184807933, + "language_loss": 0.86319685, + "learning_rate": 0.0009110694587092192, + "loss": 0.87486213, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.1595459, + "routerloss_mlp": 0.0, + "step": 1128, + "time_per_iteration": 2.7497644424438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179675, + "balance_loss_mlp": 1.1634866, + "diversity_loss_mlp": 0.0, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.1038215552752292, + "language_loss": 0.81267089, + "learning_rate": 0.0009108920219620815, + "loss": 0.82446766, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.16186523, + "routerloss_mlp": 0.0, + "step": 1129, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195026, + "balance_loss_mlp": 1.1788609, + "diversity_loss_mlp": 0.0, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.06771714561059723, + "language_loss": 0.89286679, + "learning_rate": 0.0009107144256925133, + "loss": 0.9048171, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1130, + "time_per_iteration": 2.6569926738739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196317, + "balance_loss_mlp": 1.18006873, + "diversity_loss_mlp": 0.0, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.08333124164895586, + "language_loss": 0.82520813, + "learning_rate": 0.0009105366699694638, + "loss": 0.83717132, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.16247559, + "routerloss_mlp": 0.0, + "step": 1131, + "time_per_iteration": 2.7384698390960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200769, + "balance_loss_mlp": 1.18390059, + "diversity_loss_mlp": 0.0, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.07018840625680964, + "language_loss": 0.81826723, + "learning_rate": 0.0009103587548619439, + "loss": 0.83027488, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.16882324, + "routerloss_mlp": 0.0, + "step": 1132, + "time_per_iteration": 2.8361291885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188026, + "balance_loss_mlp": 1.17064476, + "diversity_loss_mlp": 0.0, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.08238158624987729, + "language_loss": 0.85952497, + "learning_rate": 0.0009101806804390261, + "loss": 0.87140524, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.1739502, + "routerloss_mlp": 0.0, + "step": 1133, + "time_per_iteration": 2.8646528720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846565, + "balance_loss_mlp": 1.45559311, + "diversity_loss_mlp": 0.20202307, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.03511986753794681, + "language_loss": 0.90682399, + "learning_rate": 0.0009100024467698453, + "loss": 0.91528964, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01775702, + "step": 1134, + "time_per_iteration": 2.628955364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119036, + "balance_loss_mlp": 1.17289567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.09831196896097749, + "language_loss": 0.82889581, + "learning_rate": 0.0009098240539235981, + "loss": 0.84079945, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.17492676, + "routerloss_mlp": 0.0, + "step": 1135, + "time_per_iteration": 2.6857638359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179858, + "balance_loss_mlp": 1.16191649, + "diversity_loss_mlp": 0.0, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07855046788509763, + "language_loss": 0.87649047, + "learning_rate": 0.0009096455019695423, + "loss": 0.88828909, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.1796875, + "routerloss_mlp": 0.0, + "step": 1136, + "time_per_iteration": 2.814746856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175201, + "balance_loss_mlp": 1.15702188, + "diversity_loss_mlp": 0.0, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.090535881946018, + "language_loss": 0.89789271, + "learning_rate": 0.000909466790976998, + "loss": 0.90964472, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18188477, + "routerloss_mlp": 0.0, + "step": 1137, + "time_per_iteration": 2.503934144973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13231349, + "diversity_loss_mlp": 0.0, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.07386356915969775, + "language_loss": 0.82546908, + "learning_rate": 0.0009092879210153473, + "loss": 0.83698207, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18981934, + "routerloss_mlp": 0.0, + "step": 1138, + "time_per_iteration": 3.106015682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143167, + "balance_loss_mlp": 1.12445128, + "diversity_loss_mlp": 0.0, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08443059177839436, + "language_loss": 0.89126158, + "learning_rate": 0.0009091088921540333, + "loss": 0.90269327, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.18701172, + "routerloss_mlp": 0.0, + "step": 1139, + "time_per_iteration": 2.5165584087371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197317, + "balance_loss_mlp": 1.18491888, + "diversity_loss_mlp": 0.0, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.06938907882855633, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76705992, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 1140, + "time_per_iteration": 4.907839775085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845315, + "balance_loss_mlp": 1.45913088, + "diversity_loss_mlp": 0.19676474, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.04157801253712285, + "language_loss": 0.84799111, + "learning_rate": 0.0009087503580104985, + "loss": 0.8564443, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01736734, + "step": 1141, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106137, + "balance_loss_mlp": 1.08643126, + "diversity_loss_mlp": 0.0, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09652849342648293, + "language_loss": 0.7964108, + "learning_rate": 0.0009085708528674728, + "loss": 0.80747211, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.19689941, + "routerloss_mlp": 0.0, + "step": 1142, + "time_per_iteration": 2.7800490856170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115066, + "balance_loss_mlp": 1.09476519, + "diversity_loss_mlp": 0.0, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.11345906914127299, + "language_loss": 0.8700006, + "learning_rate": 0.0009083911891031745, + "loss": 0.88115132, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.20300293, + "routerloss_mlp": 0.0, + "step": 1143, + "time_per_iteration": 3.104893684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110402, + "balance_loss_mlp": 1.08533978, + "diversity_loss_mlp": 0.0, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.12428556161586228, + "language_loss": 0.91569418, + "learning_rate": 0.0009082113667873553, + "loss": 0.92673439, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18676758, + "routerloss_mlp": 0.0, + "step": 1144, + "time_per_iteration": 3.0838277339935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138926, + "balance_loss_mlp": 1.12060392, + "diversity_loss_mlp": 0.0, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.0955721440223133, + "language_loss": 0.90911627, + "learning_rate": 0.0009080313859898283, + "loss": 0.92050546, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.18334961, + "routerloss_mlp": 0.0, + "step": 1145, + "time_per_iteration": 2.4998109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162548, + "balance_loss_mlp": 1.14463091, + "diversity_loss_mlp": 0.0, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.07871728913387968, + "language_loss": 0.91642439, + "learning_rate": 0.0009078512467804684, + "loss": 0.92804986, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.17932129, + "routerloss_mlp": 0.0, + "step": 1146, + "time_per_iteration": 2.583137273788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192448, + "balance_loss_mlp": 1.17516243, + "diversity_loss_mlp": 0.0, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.10815580627735921, + "language_loss": 0.90245295, + "learning_rate": 0.0009076709492292119, + "loss": 0.91437739, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.1730957, + "routerloss_mlp": 0.0, + "step": 1147, + "time_per_iteration": 2.6189510822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199389, + "balance_loss_mlp": 1.18260384, + "diversity_loss_mlp": 0.0, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.10018226205073696, + "language_loss": 0.88948917, + "learning_rate": 0.0009074904934060562, + "loss": 0.90148306, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1148, + "time_per_iteration": 2.6619913578033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119284, + "balance_loss_mlp": 1.17623389, + "diversity_loss_mlp": 0.0, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.09879445691718633, + "language_loss": 0.85041308, + "learning_rate": 0.0009073098793810607, + "loss": 0.8623414, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.1661377, + "routerloss_mlp": 0.0, + "step": 1149, + "time_per_iteration": 2.9382119178771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185083, + "balance_loss_mlp": 1.16848898, + "diversity_loss_mlp": 0.0, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.09716543961816822, + "language_loss": 0.88557786, + "learning_rate": 0.000907129107224346, + "loss": 0.89742863, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.16601562, + "routerloss_mlp": 0.0, + "step": 1150, + "time_per_iteration": 2.717400550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.17356002, + "diversity_loss_mlp": 0.0, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.0741661773141201, + "language_loss": 0.88313866, + "learning_rate": 0.0009069481770060939, + "loss": 0.89504004, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1151, + "time_per_iteration": 2.676938056945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118655, + "balance_loss_mlp": 1.17039752, + "diversity_loss_mlp": 0.0, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.06827936796637825, + "language_loss": 0.83848286, + "learning_rate": 0.000906767088796548, + "loss": 0.85034835, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1152, + "time_per_iteration": 3.442782163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185002, + "balance_loss_mlp": 1.16889715, + "diversity_loss_mlp": 0.0, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.07358747282835834, + "language_loss": 0.87001419, + "learning_rate": 0.0009065858426660127, + "loss": 0.88186425, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.16101074, + "routerloss_mlp": 0.0, + "step": 1153, + "time_per_iteration": 2.6501753330230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178927, + "balance_loss_mlp": 1.16286922, + "diversity_loss_mlp": 0.0, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.0863709920952229, + "language_loss": 0.84764236, + "learning_rate": 0.0009064044386848543, + "loss": 0.85943162, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1154, + "time_per_iteration": 2.920689344406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176891, + "balance_loss_mlp": 1.16032064, + "diversity_loss_mlp": 0.0, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.07669791788600007, + "language_loss": 0.88829726, + "learning_rate": 0.0009062228769234997, + "loss": 0.90006614, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.16577148, + "routerloss_mlp": 0.0, + "step": 1155, + "time_per_iteration": 2.561638832092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154629, + "balance_loss_mlp": 1.13797593, + "diversity_loss_mlp": 0.0, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08447027490527963, + "language_loss": 0.81123281, + "learning_rate": 0.0009060411574524376, + "loss": 0.82277906, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.16662598, + "routerloss_mlp": 0.0, + "step": 1156, + "time_per_iteration": 2.655132293701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162354, + "balance_loss_mlp": 1.14597416, + "diversity_loss_mlp": 0.0, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.08665349089557017, + "language_loss": 0.87817705, + "learning_rate": 0.0009058592803422178, + "loss": 0.88980061, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.16381836, + "routerloss_mlp": 0.0, + "step": 1157, + "time_per_iteration": 3.1417362689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183028, + "balance_loss_mlp": 1.17430186, + "diversity_loss_mlp": 0.0, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.06198684812147071, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79893315, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 1158, + "time_per_iteration": 4.867843866348267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_mlp": 1.1120069, + "diversity_loss_mlp": 0.0, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0864152607347894, + "language_loss": 0.90156865, + "learning_rate": 0.00090549505348681, + "loss": 0.91285539, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.16674805, + "routerloss_mlp": 0.0, + "step": 1159, + "time_per_iteration": 2.581865072250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118982, + "balance_loss_mlp": 1.1025548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.07056827667929483, + "language_loss": 0.83819324, + "learning_rate": 0.0009053127038830275, + "loss": 0.84938306, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.16430664, + "routerloss_mlp": 0.0, + "step": 1160, + "time_per_iteration": 2.9969708919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881169, + "balance_loss_mlp": 1.53314447, + "diversity_loss_mlp": 0.19063006, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.04002382495760162, + "language_loss": 0.87460124, + "learning_rate": 0.000905130196922898, + "loss": 0.88341296, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01928164, + "step": 1161, + "time_per_iteration": 2.6307718753814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881407, + "balance_loss_mlp": 1.5316093, + "diversity_loss_mlp": 0.19140732, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.030280826501304762, + "language_loss": 0.86784196, + "learning_rate": 0.0009049475326772769, + "loss": 0.87665606, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01989887, + "step": 1162, + "time_per_iteration": 2.6021478176116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00889034, + "balance_loss_mlp": 1.54766631, + "diversity_loss_mlp": 0.19066738, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.03198536270345376, + "language_loss": 0.83124602, + "learning_rate": 0.0009047647112170811, + "loss": 0.84013629, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01986698, + "step": 1163, + "time_per_iteration": 2.804150342941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123868, + "balance_loss_mlp": 1.1070838, + "diversity_loss_mlp": 0.0, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.09901141435665076, + "language_loss": 0.87948084, + "learning_rate": 0.0009045817326132876, + "loss": 0.89071947, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1164, + "time_per_iteration": 3.6840732097625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125369, + "balance_loss_mlp": 1.107988, + "diversity_loss_mlp": 0.0, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08432013167879508, + "language_loss": 0.83142793, + "learning_rate": 0.0009043985969369357, + "loss": 0.84268159, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17407227, + "routerloss_mlp": 0.0, + "step": 1165, + "time_per_iteration": 2.8148193359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146301, + "balance_loss_mlp": 1.12976706, + "diversity_loss_mlp": 0.0, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06944445596490195, + "language_loss": 0.84334069, + "learning_rate": 0.0009042153042591245, + "loss": 0.85480368, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.16540527, + "routerloss_mlp": 0.0, + "step": 1166, + "time_per_iteration": 2.8004493713378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142176, + "balance_loss_mlp": 1.12542677, + "diversity_loss_mlp": 0.0, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.06821660135571728, + "language_loss": 0.85225487, + "learning_rate": 0.0009040318546510146, + "loss": 0.86367661, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.16760254, + "routerloss_mlp": 0.0, + "step": 1167, + "time_per_iteration": 3.1969215869903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156354, + "balance_loss_mlp": 1.13979554, + "diversity_loss_mlp": 0.0, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.06547364647617461, + "language_loss": 0.84988701, + "learning_rate": 0.0009038482481838275, + "loss": 0.86145055, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.16564941, + "routerloss_mlp": 0.0, + "step": 1168, + "time_per_iteration": 2.7087180614471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00861334, + "balance_loss_mlp": 1.49333596, + "diversity_loss_mlp": 0.19261675, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.02892951533663535, + "language_loss": 0.87266529, + "learning_rate": 0.0009036644849288455, + "loss": 0.88127863, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01835741, + "step": 1169, + "time_per_iteration": 3.1039352416992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.1631248, + "diversity_loss_mlp": 0.0, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.06865085555084699, + "language_loss": 0.85404736, + "learning_rate": 0.0009034805649574118, + "loss": 0.86584634, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.16784668, + "routerloss_mlp": 0.0, + "step": 1170, + "time_per_iteration": 2.659322738647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208955, + "balance_loss_mlp": 1.1926589, + "diversity_loss_mlp": 0.0, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.07685307661183591, + "language_loss": 0.85691977, + "learning_rate": 0.0009032964883409308, + "loss": 0.86900926, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.16296387, + "routerloss_mlp": 0.0, + "step": 1171, + "time_per_iteration": 2.8938751220703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128211, + "balance_loss_mlp": 1.11910319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.06058864885284362, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74178743, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 1172, + "time_per_iteration": 4.983820676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217918, + "balance_loss_mlp": 1.20207548, + "diversity_loss_mlp": 0.0, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.1048847225020503, + "language_loss": 0.8717351, + "learning_rate": 0.0009029278654587462, + "loss": 0.88391435, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.1583252, + "routerloss_mlp": 0.0, + "step": 1173, + "time_per_iteration": 2.639632225036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181665, + "balance_loss_mlp": 1.16508245, + "diversity_loss_mlp": 0.0, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.07111002228073603, + "language_loss": 0.82226282, + "learning_rate": 0.0009027433193361548, + "loss": 0.83407944, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.16589355, + "routerloss_mlp": 0.0, + "step": 1174, + "time_per_iteration": 2.7443323135375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159983, + "balance_loss_mlp": 1.14366364, + "diversity_loss_mlp": 0.0, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.06531304020653, + "language_loss": 0.86980343, + "learning_rate": 0.00090255861685474, + "loss": 0.88140327, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.16320801, + "routerloss_mlp": 0.0, + "step": 1175, + "time_per_iteration": 2.7534220218658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142116, + "balance_loss_mlp": 1.12533128, + "diversity_loss_mlp": 0.0, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.10016618462748716, + "language_loss": 0.90750074, + "learning_rate": 0.0009023737580862095, + "loss": 0.91892195, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.16796875, + "routerloss_mlp": 0.0, + "step": 1176, + "time_per_iteration": 2.5116937160491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114077, + "balance_loss_mlp": 1.12470055, + "diversity_loss_mlp": 0.0, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0707285441494173, + "language_loss": 0.83225566, + "learning_rate": 0.0009021887431023321, + "loss": 0.84366333, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1177, + "time_per_iteration": 2.599956512451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130034, + "balance_loss_mlp": 1.11444104, + "diversity_loss_mlp": 0.0, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08431891612549362, + "language_loss": 0.87212515, + "learning_rate": 0.0009020035719749369, + "loss": 0.88342547, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.15576172, + "routerloss_mlp": 0.0, + "step": 1178, + "time_per_iteration": 2.7144312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135701, + "balance_loss_mlp": 1.1205014, + "diversity_loss_mlp": 0.0, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.09883499682369536, + "language_loss": 0.77450085, + "learning_rate": 0.0009018182447759136, + "loss": 0.7858578, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1179, + "time_per_iteration": 2.98848557472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137145, + "balance_loss_mlp": 1.12187457, + "diversity_loss_mlp": 0.0, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.08173095074239418, + "language_loss": 0.79878223, + "learning_rate": 0.0009016327615772126, + "loss": 0.81015366, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1180, + "time_per_iteration": 2.9338154792785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149275, + "balance_loss_mlp": 1.13449335, + "diversity_loss_mlp": 0.0, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.08374692364956231, + "language_loss": 0.87680298, + "learning_rate": 0.0009014471224508451, + "loss": 0.88829577, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1181, + "time_per_iteration": 2.7131431102752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00881934, + "balance_loss_mlp": 1.53494334, + "diversity_loss_mlp": 0.19571492, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.04185105584005936, + "language_loss": 0.83154267, + "learning_rate": 0.0009012613274688823, + "loss": 0.84036207, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01660516, + "step": 1182, + "time_per_iteration": 2.649559736251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184244, + "balance_loss_mlp": 1.1692239, + "diversity_loss_mlp": 0.0, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.12019924395271459, + "language_loss": 0.87753081, + "learning_rate": 0.0009010753767034565, + "loss": 0.8893733, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1183, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175003, + "balance_loss_mlp": 1.16030502, + "diversity_loss_mlp": 0.0, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.08783280174490297, + "language_loss": 0.78918862, + "learning_rate": 0.0009008892702267599, + "loss": 0.80093861, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1184, + "time_per_iteration": 2.9962406158447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139115, + "balance_loss_mlp": 1.12460732, + "diversity_loss_mlp": 0.0, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.08254121322216867, + "language_loss": 0.88525105, + "learning_rate": 0.0009007030081110457, + "loss": 0.89664215, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1185, + "time_per_iteration": 2.5990660190582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125585, + "balance_loss_mlp": 1.11087465, + "diversity_loss_mlp": 0.0, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.07610459395316062, + "language_loss": 0.84548527, + "learning_rate": 0.000900516590428627, + "loss": 0.85674113, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1186, + "time_per_iteration": 2.7377407550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121666, + "balance_loss_mlp": 1.1070751, + "diversity_loss_mlp": 0.0, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.13748029932532174, + "language_loss": 0.89182103, + "learning_rate": 0.0009003300172518778, + "loss": 0.90303767, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1187, + "time_per_iteration": 2.6916556358337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116227, + "balance_loss_mlp": 1.10145736, + "diversity_loss_mlp": 0.0, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.11313229810108143, + "language_loss": 0.84335989, + "learning_rate": 0.0009001432886532321, + "loss": 0.85452211, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.14758301, + "routerloss_mlp": 0.0, + "step": 1188, + "time_per_iteration": 2.9698264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10021877, + "diversity_loss_mlp": 0.0, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.06729358528862889, + "language_loss": 0.86774516, + "learning_rate": 0.0008999564047051843, + "loss": 0.87889242, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1189, + "time_per_iteration": 2.5002098083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136799, + "balance_loss_mlp": 1.12243462, + "diversity_loss_mlp": 0.0, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0714274855120672, + "language_loss": 0.84824312, + "learning_rate": 0.0008997693654802894, + "loss": 0.85961115, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.14379883, + "routerloss_mlp": 0.0, + "step": 1190, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149047, + "balance_loss_mlp": 1.13425303, + "diversity_loss_mlp": 0.0, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.07754985979781381, + "language_loss": 0.86714745, + "learning_rate": 0.0008995821710511625, + "loss": 0.87863791, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1191, + "time_per_iteration": 2.7126989364624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14807296, + "diversity_loss_mlp": 0.0, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.11547698788472376, + "language_loss": 0.85060751, + "learning_rate": 0.0008993948214904786, + "loss": 0.86223602, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1192, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152534, + "balance_loss_mlp": 1.14361739, + "diversity_loss_mlp": 0.0, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.05307726892258072, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79574746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 1193, + "time_per_iteration": 4.909748792648315 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187526, + "balance_loss_mlp": 1.17205215, + "diversity_loss_mlp": 0.0, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.09739164860103838, + "language_loss": 0.78353333, + "learning_rate": 0.0008990196572654427, + "loss": 0.79540861, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.15454102, + "routerloss_mlp": 0.0, + "step": 1194, + "time_per_iteration": 2.8592262268066406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117424, + "balance_loss_mlp": 1.1592319, + "diversity_loss_mlp": 0.0, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.06260411033315277, + "language_loss": 0.87559408, + "learning_rate": 0.0008988318427467426, + "loss": 0.88733649, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.14990234, + "routerloss_mlp": 0.0, + "step": 1195, + "time_per_iteration": 2.7444722652435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00878316, + "balance_loss_mlp": 1.52780199, + "diversity_loss_mlp": 0.1948241, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.0364111048645648, + "language_loss": 0.86376345, + "learning_rate": 0.0008986438733877887, + "loss": 0.87254667, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01700337, + "step": 1196, + "time_per_iteration": 3.5090088844299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137546, + "balance_loss_mlp": 1.1229074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.08413871186116019, + "language_loss": 0.83810687, + "learning_rate": 0.0008984557492615576, + "loss": 0.84948236, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1197, + "time_per_iteration": 2.9953744411468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10803354, + "diversity_loss_mlp": 0.0, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.08617240411661099, + "language_loss": 0.90267789, + "learning_rate": 0.0008982674704410854, + "loss": 0.91390687, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1198, + "time_per_iteration": 2.7513339519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110338, + "balance_loss_mlp": 1.09598517, + "diversity_loss_mlp": 0.0, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.11146547076727734, + "language_loss": 0.77876621, + "learning_rate": 0.0008980790369994682, + "loss": 0.78986955, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1199, + "time_per_iteration": 2.989825487136841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120977, + "balance_loss_mlp": 1.10670781, + "diversity_loss_mlp": 0.0, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.0677628031660983, + "language_loss": 0.8729977, + "learning_rate": 0.000897890449009863, + "loss": 0.88420743, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1200, + "time_per_iteration": 2.6784448623657227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127646, + "balance_loss_mlp": 1.11330509, + "diversity_loss_mlp": 0.0, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.080414080555838, + "language_loss": 0.89825618, + "learning_rate": 0.0008977017065454853, + "loss": 0.90953267, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1201, + "time_per_iteration": 2.6610703468322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880483, + "balance_loss_mlp": 1.52539706, + "diversity_loss_mlp": 0.19880572, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.03277795962214655, + "language_loss": 0.80367738, + "learning_rate": 0.0008975128096796121, + "loss": 0.81248224, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01838172, + "step": 1202, + "time_per_iteration": 2.901998996734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145011, + "balance_loss_mlp": 1.13089633, + "diversity_loss_mlp": 0.0, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.10693947298766643, + "language_loss": 0.85848922, + "learning_rate": 0.0008973237584855794, + "loss": 0.86993933, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1203, + "time_per_iteration": 2.872408151626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160017, + "balance_loss_mlp": 1.1457237, + "diversity_loss_mlp": 0.0, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08753213296005687, + "language_loss": 0.82586002, + "learning_rate": 0.0008971345530367832, + "loss": 0.83746028, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1204, + "time_per_iteration": 2.4641921520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185717, + "balance_loss_mlp": 1.17120886, + "diversity_loss_mlp": 0.0, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.07947534631123947, + "language_loss": 0.85658818, + "learning_rate": 0.0008969451934066799, + "loss": 0.8684454, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1205, + "time_per_iteration": 2.7822117805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173591, + "balance_loss_mlp": 1.15872586, + "diversity_loss_mlp": 0.0, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.08780432716538046, + "language_loss": 0.79991889, + "learning_rate": 0.0008967556796687854, + "loss": 0.81165481, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1206, + "time_per_iteration": 2.8849406242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117013, + "balance_loss_mlp": 1.15584886, + "diversity_loss_mlp": 0.0, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.07569633120476413, + "language_loss": 0.83779937, + "learning_rate": 0.0008965660118966752, + "loss": 0.84950066, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1207, + "time_per_iteration": 2.9316329956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146692, + "balance_loss_mlp": 1.1319102, + "diversity_loss_mlp": 0.0, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.06968265941642382, + "language_loss": 0.90114093, + "learning_rate": 0.0008963761901639851, + "loss": 0.91260791, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1208, + "time_per_iteration": 2.8140323162078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113879, + "balance_loss_mlp": 1.12392485, + "diversity_loss_mlp": 0.0, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.08612535310277082, + "language_loss": 0.83098078, + "learning_rate": 0.0008961862145444103, + "loss": 0.84236872, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1209, + "time_per_iteration": 2.7529945373535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122935, + "balance_loss_mlp": 1.10796285, + "diversity_loss_mlp": 0.0, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.08243119711445285, + "language_loss": 0.85338795, + "learning_rate": 0.0008959960851117059, + "loss": 0.86461735, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1210, + "time_per_iteration": 2.624340534210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108554, + "balance_loss_mlp": 1.09396267, + "diversity_loss_mlp": 0.0, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.10596241027535934, + "language_loss": 0.84048676, + "learning_rate": 0.0008958058019396868, + "loss": 0.85157233, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1211, + "time_per_iteration": 2.8316566944122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112068, + "balance_loss_mlp": 1.09751284, + "diversity_loss_mlp": 0.0, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.07651667178885936, + "language_loss": 0.86494702, + "learning_rate": 0.0008956153651022274, + "loss": 0.8760677, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1212, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103634, + "balance_loss_mlp": 1.08926892, + "diversity_loss_mlp": 0.0, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07459915787800217, + "language_loss": 0.83929688, + "learning_rate": 0.0008954247746732618, + "loss": 0.85033321, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1213, + "time_per_iteration": 2.6184399127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117524, + "balance_loss_mlp": 1.10321903, + "diversity_loss_mlp": 0.0, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.08317009769115577, + "language_loss": 0.90604293, + "learning_rate": 0.0008952340307267837, + "loss": 0.91721821, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1214, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119644, + "balance_loss_mlp": 1.10553002, + "diversity_loss_mlp": 0.0, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.09601716623847659, + "language_loss": 0.83731341, + "learning_rate": 0.0008950431333368468, + "loss": 0.84850979, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1215, + "time_per_iteration": 2.6151199340820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130858, + "balance_loss_mlp": 1.11676729, + "diversity_loss_mlp": 0.0, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.08049188450288745, + "language_loss": 0.84623635, + "learning_rate": 0.0008948520825775634, + "loss": 0.8575449, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1216, + "time_per_iteration": 3.645200490951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123063, + "balance_loss_mlp": 1.10880601, + "diversity_loss_mlp": 0.0, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.08038238822992319, + "language_loss": 0.83978343, + "learning_rate": 0.0008946608785231067, + "loss": 0.85101402, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1217, + "time_per_iteration": 2.871616840362549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126329, + "balance_loss_mlp": 1.11263156, + "diversity_loss_mlp": 0.0, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07832391647543825, + "language_loss": 0.84442961, + "learning_rate": 0.0008944695212477084, + "loss": 0.85569292, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1218, + "time_per_iteration": 2.507080078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123493, + "balance_loss_mlp": 1.10867572, + "diversity_loss_mlp": 0.0, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.07420792055611987, + "language_loss": 0.86334574, + "learning_rate": 0.0008942780108256599, + "loss": 0.87458062, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1219, + "time_per_iteration": 2.6183433532714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107778, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.07657909053901747, + "language_loss": 0.86160946, + "learning_rate": 0.0008940863473313121, + "loss": 0.87268722, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1220, + "time_per_iteration": 2.495164632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107377, + "balance_loss_mlp": 1.09272623, + "diversity_loss_mlp": 0.0, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.07962638616920462, + "language_loss": 0.87889743, + "learning_rate": 0.0008938945308390756, + "loss": 0.88997114, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1221, + "time_per_iteration": 2.613927125930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097552, + "balance_loss_mlp": 1.08298469, + "diversity_loss_mlp": 0.0, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.06679649396710063, + "language_loss": 0.87179595, + "learning_rate": 0.00089370256142342, + "loss": 0.88277149, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1222, + "time_per_iteration": 2.732208013534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094745, + "balance_loss_mlp": 1.07952189, + "diversity_loss_mlp": 0.0, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.06680688140454344, + "language_loss": 0.84810197, + "learning_rate": 0.0008935104391588746, + "loss": 0.85904944, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1223, + "time_per_iteration": 2.7585461139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094902, + "balance_loss_mlp": 1.07917881, + "diversity_loss_mlp": 0.0, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.07271030004651308, + "language_loss": 0.83111542, + "learning_rate": 0.0008933181641200276, + "loss": 0.84206444, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.15710449, + "routerloss_mlp": 0.0, + "step": 1224, + "time_per_iteration": 3.1440725326538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087445, + "balance_loss_mlp": 1.07139981, + "diversity_loss_mlp": 0.0, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.07882513603721358, + "language_loss": 0.85824931, + "learning_rate": 0.0008931257363815271, + "loss": 0.8691237, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.16040039, + "routerloss_mlp": 0.0, + "step": 1225, + "time_per_iteration": 2.8887243270874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092595, + "balance_loss_mlp": 1.07659674, + "diversity_loss_mlp": 0.0, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.09571789824401095, + "language_loss": 0.89901638, + "learning_rate": 0.0008929331560180798, + "loss": 0.90994227, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.15991211, + "routerloss_mlp": 0.0, + "step": 1226, + "time_per_iteration": 2.897155284881592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095366, + "balance_loss_mlp": 1.07965469, + "diversity_loss_mlp": 0.0, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.068724406385502, + "language_loss": 0.90771782, + "learning_rate": 0.0008927404231044525, + "loss": 0.91867149, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.15698242, + "routerloss_mlp": 0.0, + "step": 1227, + "time_per_iteration": 2.6892144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.08764625, + "diversity_loss_mlp": 0.0, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.06943954848997126, + "language_loss": 0.81646705, + "learning_rate": 0.0008925475377154703, + "loss": 0.82749879, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.15515137, + "routerloss_mlp": 0.0, + "step": 1228, + "time_per_iteration": 2.727325201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129278, + "balance_loss_mlp": 1.11394727, + "diversity_loss_mlp": 0.0, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.0778889683705481, + "language_loss": 0.8212285, + "learning_rate": 0.0008923544999260183, + "loss": 0.83252132, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1229, + "time_per_iteration": 2.7520618438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146737, + "balance_loss_mlp": 1.13194346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.0853653064859127, + "language_loss": 0.91254115, + "learning_rate": 0.00089216130981104, + "loss": 0.92400861, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1230, + "time_per_iteration": 3.016228199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138894, + "balance_loss_mlp": 1.12364721, + "diversity_loss_mlp": 0.0, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.08048994442870243, + "language_loss": 0.82752085, + "learning_rate": 0.000891967967445539, + "loss": 0.83890975, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.15222168, + "routerloss_mlp": 0.0, + "step": 1231, + "time_per_iteration": 2.65736722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126061, + "balance_loss_mlp": 1.11135054, + "diversity_loss_mlp": 0.0, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.05909715635047166, + "language_loss": 0.889099, + "learning_rate": 0.0008917744729045772, + "loss": 0.90035963, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1232, + "time_per_iteration": 2.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110871, + "balance_loss_mlp": 1.0962795, + "diversity_loss_mlp": 0.0, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08046733758331526, + "language_loss": 0.83836448, + "learning_rate": 0.0008915808262632757, + "loss": 0.84947324, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1233, + "time_per_iteration": 2.860353708267212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918962, + "balance_loss_mlp": 1.60287488, + "diversity_loss_mlp": 0.20008399, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.03182006079144566, + "language_loss": 0.93544835, + "learning_rate": 0.0008913870275968148, + "loss": 0.94463801, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.017482, + "step": 1234, + "time_per_iteration": 2.7328829765319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095751, + "balance_loss_mlp": 1.08008718, + "diversity_loss_mlp": 0.0, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.07195832826776788, + "language_loss": 0.87503707, + "learning_rate": 0.0008911930769804342, + "loss": 0.88599461, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1235, + "time_per_iteration": 3.2619638442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091405, + "balance_loss_mlp": 1.07551408, + "diversity_loss_mlp": 0.0, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.07148547933088874, + "language_loss": 0.91313815, + "learning_rate": 0.0008909989744894318, + "loss": 0.92405218, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1236, + "time_per_iteration": 2.8687992095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06530952, + "diversity_loss_mlp": 0.0, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.08021447901266163, + "language_loss": 0.81662518, + "learning_rate": 0.0008908047201991649, + "loss": 0.8274349, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.15649414, + "routerloss_mlp": 0.0, + "step": 1237, + "time_per_iteration": 2.737638235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076608, + "balance_loss_mlp": 1.06138515, + "diversity_loss_mlp": 0.0, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.07749899394714953, + "language_loss": 0.86585152, + "learning_rate": 0.0008906103141850502, + "loss": 0.87661767, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.15197754, + "routerloss_mlp": 0.0, + "step": 1238, + "time_per_iteration": 2.9184746742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068848, + "balance_loss_mlp": 1.05385113, + "diversity_loss_mlp": 0.0, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.10230617436374452, + "language_loss": 0.88104367, + "learning_rate": 0.0008904157565225621, + "loss": 0.89173216, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.1496582, + "routerloss_mlp": 0.0, + "step": 1239, + "time_per_iteration": 2.6396749019622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_mlp": 1.06220865, + "diversity_loss_mlp": 0.0, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.10467557893696883, + "language_loss": 0.81824136, + "learning_rate": 0.000890221047287235, + "loss": 0.82901168, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1240, + "time_per_iteration": 3.496812582015991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081371, + "balance_loss_mlp": 1.06710172, + "diversity_loss_mlp": 0.0, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.09443583580909311, + "language_loss": 0.91125917, + "learning_rate": 0.0008900261865546615, + "loss": 0.92207289, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1241, + "time_per_iteration": 2.6527724266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103977, + "balance_loss_mlp": 1.0890398, + "diversity_loss_mlp": 0.0, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.08429957072104315, + "language_loss": 0.84985352, + "learning_rate": 0.0008898311744004936, + "loss": 0.86089325, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1242, + "time_per_iteration": 2.6740338802337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118763, + "balance_loss_mlp": 1.10411179, + "diversity_loss_mlp": 0.0, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.07332762129893158, + "language_loss": 0.86932802, + "learning_rate": 0.0008896360109004414, + "loss": 0.88051569, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1243, + "time_per_iteration": 2.643489122390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12715125, + "diversity_loss_mlp": 0.0, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09306092844590973, + "language_loss": 0.84636557, + "learning_rate": 0.0008894406961302742, + "loss": 0.85778666, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1244, + "time_per_iteration": 2.5876173973083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150798, + "balance_loss_mlp": 1.13590896, + "diversity_loss_mlp": 0.0, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.0838589606869783, + "language_loss": 0.83944738, + "learning_rate": 0.0008892452301658201, + "loss": 0.85095537, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1245, + "time_per_iteration": 2.928391218185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116146, + "balance_loss_mlp": 1.1460346, + "diversity_loss_mlp": 0.0, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.0736247551351698, + "language_loss": 0.83299339, + "learning_rate": 0.0008890496130829653, + "loss": 0.84460801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1246, + "time_per_iteration": 2.6510462760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915571, + "balance_loss_mlp": 1.59993446, + "diversity_loss_mlp": 0.1987851, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.03287481157446996, + "language_loss": 0.85918486, + "learning_rate": 0.0008888538449576555, + "loss": 0.86834061, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01621127, + "step": 1247, + "time_per_iteration": 2.5719456672668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178279, + "balance_loss_mlp": 1.16323447, + "diversity_loss_mlp": 0.0, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10811715250715398, + "language_loss": 0.83036304, + "learning_rate": 0.0008886579258658944, + "loss": 0.8421458, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.15014648, + "routerloss_mlp": 0.0, + "step": 1248, + "time_per_iteration": 2.5736701488494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148631, + "balance_loss_mlp": 1.13341999, + "diversity_loss_mlp": 0.0, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.07868761607649298, + "language_loss": 0.84717274, + "learning_rate": 0.0008884618558837446, + "loss": 0.85865903, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.15185547, + "routerloss_mlp": 0.0, + "step": 1249, + "time_per_iteration": 2.8215761184692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911764, + "balance_loss_mlp": 1.59372783, + "diversity_loss_mlp": 0.19720009, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.03236174678929329, + "language_loss": 0.8677094, + "learning_rate": 0.0008882656350873273, + "loss": 0.87682706, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01629994, + "step": 1250, + "time_per_iteration": 2.885092258453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126022, + "balance_loss_mlp": 1.11122799, + "diversity_loss_mlp": 0.0, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.08347743908005935, + "language_loss": 0.87000573, + "learning_rate": 0.0008880692635528219, + "loss": 0.88126594, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1251, + "time_per_iteration": 3.049070119857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106629, + "balance_loss_mlp": 1.09177542, + "diversity_loss_mlp": 0.0, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.07406446185181008, + "language_loss": 0.89514965, + "learning_rate": 0.0008878727413564669, + "loss": 0.90621597, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1252, + "time_per_iteration": 2.734839677810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.06804204, + "diversity_loss_mlp": 0.0, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.048930323133030355, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81211317, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1253, + "time_per_iteration": 4.854974031448364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00873083, + "balance_loss_mlp": 1.51531768, + "diversity_loss_mlp": 0.19563958, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.03648198852202315, + "language_loss": 0.78763413, + "learning_rate": 0.0008874792452834528, + "loss": 0.7963649, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01760404, + "step": 1254, + "time_per_iteration": 2.803690195083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090026, + "balance_loss_mlp": 1.07530415, + "diversity_loss_mlp": 0.0, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.09659900556863026, + "language_loss": 0.8729195, + "learning_rate": 0.0008872822715595626, + "loss": 0.88381982, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1255, + "time_per_iteration": 2.657867670059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084204, + "balance_loss_mlp": 1.06968451, + "diversity_loss_mlp": 0.0, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10497791491954662, + "language_loss": 0.87333822, + "learning_rate": 0.0008870851474793598, + "loss": 0.88418031, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1256, + "time_per_iteration": 2.5694568157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083756, + "balance_loss_mlp": 1.06920075, + "diversity_loss_mlp": 0.0, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.07331256259210016, + "language_loss": 0.89243567, + "learning_rate": 0.0008868878731193752, + "loss": 0.90327322, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1257, + "time_per_iteration": 2.829789400100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086138, + "balance_loss_mlp": 1.07158267, + "diversity_loss_mlp": 0.0, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.07236027639177293, + "language_loss": 0.89720446, + "learning_rate": 0.0008866904485561973, + "loss": 0.90806586, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.14526367, + "routerloss_mlp": 0.0, + "step": 1258, + "time_per_iteration": 2.731635570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078524, + "balance_loss_mlp": 1.06384969, + "diversity_loss_mlp": 0.0, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.0727569881861308, + "language_loss": 0.83084273, + "learning_rate": 0.000886492873866473, + "loss": 0.84162796, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1259, + "time_per_iteration": 2.8250575065612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080175, + "balance_loss_mlp": 1.06528533, + "diversity_loss_mlp": 0.0, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.10762424055834904, + "language_loss": 0.84672934, + "learning_rate": 0.000886295149126908, + "loss": 0.85753107, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.14868164, + "routerloss_mlp": 0.0, + "step": 1260, + "time_per_iteration": 2.7148356437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086434, + "balance_loss_mlp": 1.07181931, + "diversity_loss_mlp": 0.0, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.07159531524201106, + "language_loss": 0.85693741, + "learning_rate": 0.0008860972744142655, + "loss": 0.86780179, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1261, + "time_per_iteration": 2.931696653366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115009, + "balance_loss_mlp": 1.10064411, + "diversity_loss_mlp": 0.0, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.065367920687613, + "language_loss": 0.81639904, + "learning_rate": 0.0008858992498053671, + "loss": 0.82754916, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1262, + "time_per_iteration": 2.846466541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055704, + "balance_loss_mlp": 1.04764521, + "diversity_loss_mlp": 0.0, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.03374572714932058, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77644455, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1263, + "time_per_iteration": 4.882519006729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872344, + "balance_loss_mlp": 1.51226497, + "diversity_loss_mlp": 0.19974959, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.03166105856965055, + "language_loss": 0.83409035, + "learning_rate": 0.0008855027512063817, + "loss": 0.84281385, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633644, + "step": 1264, + "time_per_iteration": 2.7414488792419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185798, + "balance_loss_mlp": 1.17132628, + "diversity_loss_mlp": 0.0, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.06261248257395001, + "language_loss": 0.85949916, + "learning_rate": 0.0008853042773702292, + "loss": 0.8713572, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1265, + "time_per_iteration": 2.695514440536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196886, + "balance_loss_mlp": 1.18234205, + "diversity_loss_mlp": 0.0, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.08760826562773598, + "language_loss": 0.87981403, + "learning_rate": 0.0008851056539456896, + "loss": 0.89178288, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1266, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119913, + "balance_loss_mlp": 1.18489647, + "diversity_loss_mlp": 0.0, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.07991839198753149, + "language_loss": 0.81904382, + "learning_rate": 0.0008849068810098755, + "loss": 0.83103514, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1267, + "time_per_iteration": 3.3067915439605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174372, + "balance_loss_mlp": 1.15992332, + "diversity_loss_mlp": 0.0, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.10499473220259715, + "language_loss": 0.83550054, + "learning_rate": 0.0008847079586399575, + "loss": 0.84724426, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1268, + "time_per_iteration": 2.4791157245635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115106, + "balance_loss_mlp": 1.13699341, + "diversity_loss_mlp": 0.0, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.07765469411987547, + "language_loss": 0.86144567, + "learning_rate": 0.0008845088869131641, + "loss": 0.87295628, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1269, + "time_per_iteration": 2.6733555793762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111483, + "balance_loss_mlp": 1.10053682, + "diversity_loss_mlp": 0.0, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.0888033537849515, + "language_loss": 0.88898385, + "learning_rate": 0.0008843096659067818, + "loss": 0.90013218, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1270, + "time_per_iteration": 2.6315910816192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111213, + "balance_loss_mlp": 1.09708679, + "diversity_loss_mlp": 0.0, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.09475560383246978, + "language_loss": 0.86565858, + "learning_rate": 0.000884110295698155, + "loss": 0.87677073, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1271, + "time_per_iteration": 2.926668643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.08752966, + "diversity_loss_mlp": 0.0, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.09917556522455147, + "language_loss": 0.85849231, + "learning_rate": 0.0008839107763646861, + "loss": 0.86951411, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.14624023, + "routerloss_mlp": 0.0, + "step": 1272, + "time_per_iteration": 2.58022403717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110242, + "balance_loss_mlp": 1.08751881, + "diversity_loss_mlp": 0.0, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.08783320449451974, + "language_loss": 0.89941388, + "learning_rate": 0.0008837111079838353, + "loss": 0.91043806, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1273, + "time_per_iteration": 2.6877150535583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111566, + "balance_loss_mlp": 1.10096157, + "diversity_loss_mlp": 0.0, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.07640958054403056, + "language_loss": 0.89671296, + "learning_rate": 0.000883511290633121, + "loss": 0.90786958, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1274, + "time_per_iteration": 2.5929813385009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123812, + "balance_loss_mlp": 1.10898256, + "diversity_loss_mlp": 0.0, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.05814589763763208, + "language_loss": 0.92211604, + "learning_rate": 0.000883311324390119, + "loss": 0.93335414, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1275, + "time_per_iteration": 2.721343517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138447, + "balance_loss_mlp": 1.12315261, + "diversity_loss_mlp": 0.0, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.10098653640048322, + "language_loss": 0.81237984, + "learning_rate": 0.0008831112093324629, + "loss": 0.82376432, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.15283203, + "routerloss_mlp": 0.0, + "step": 1276, + "time_per_iteration": 3.066657543182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148152, + "balance_loss_mlp": 1.13266695, + "diversity_loss_mlp": 0.0, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07328274291062464, + "language_loss": 0.89255905, + "learning_rate": 0.0008829109455378444, + "loss": 0.90404058, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.15466309, + "routerloss_mlp": 0.0, + "step": 1277, + "time_per_iteration": 2.6705071926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163573, + "balance_loss_mlp": 1.14844561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08343231090098181, + "language_loss": 0.86569774, + "learning_rate": 0.000882710533084013, + "loss": 0.87733346, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.15100098, + "routerloss_mlp": 0.0, + "step": 1278, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.13783133, + "diversity_loss_mlp": 0.0, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0729065811951457, + "language_loss": 0.8929435, + "learning_rate": 0.0008825099720487755, + "loss": 0.90446699, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1279, + "time_per_iteration": 2.7111196517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00676302, + "balance_loss_mlp": 1.12665224, + "diversity_loss_mlp": 0.19835761, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.0027483074809680533, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.75937444, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0137972, + "step": 1280, + "time_per_iteration": 4.88429594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111003, + "balance_loss_mlp": 1.10232449, + "diversity_loss_mlp": 0.0, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.05615046205501133, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79055113, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1281, + "time_per_iteration": 4.752316236495972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113823, + "balance_loss_mlp": 1.09987593, + "diversity_loss_mlp": 0.0, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.08093958913819582, + "language_loss": 0.89542687, + "learning_rate": 0.0008819073982335619, + "loss": 0.90656507, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1282, + "time_per_iteration": 2.876927137374878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110167, + "balance_loss_mlp": 1.08783603, + "diversity_loss_mlp": 0.0, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.07169123109412263, + "language_loss": 0.84362143, + "learning_rate": 0.0008817062436519235, + "loss": 0.8546381, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.13824463, + "routerloss_mlp": 0.0, + "step": 1283, + "time_per_iteration": 2.6551387310028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086846, + "balance_loss_mlp": 1.5022366, + "diversity_loss_mlp": 0.20048198, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.033180516132009126, + "language_loss": 0.89655471, + "learning_rate": 0.0008815049408787788, + "loss": 0.90523928, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01710081, + "step": 1284, + "time_per_iteration": 2.5652830600738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100363, + "balance_loss_mlp": 1.08698821, + "diversity_loss_mlp": 0.0, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.0762028673981185, + "language_loss": 0.85473216, + "learning_rate": 0.0008813034899922805, + "loss": 0.86573577, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1285, + "time_per_iteration": 2.549622058868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111306, + "balance_loss_mlp": 1.09783578, + "diversity_loss_mlp": 0.0, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.11471388318643767, + "language_loss": 0.89855313, + "learning_rate": 0.0008811018910706387, + "loss": 0.9096663, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1286, + "time_per_iteration": 2.575176954269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117993, + "balance_loss_mlp": 1.10453439, + "diversity_loss_mlp": 0.0, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.10517914532856759, + "language_loss": 0.81922066, + "learning_rate": 0.0008809001441921211, + "loss": 0.83040059, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1287, + "time_per_iteration": 2.732236862182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126757, + "balance_loss_mlp": 1.1132865, + "diversity_loss_mlp": 0.0, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.1440229573277689, + "language_loss": 0.85392761, + "learning_rate": 0.0008806982494350528, + "loss": 0.86519527, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1288, + "time_per_iteration": 2.6544177532196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168468, + "balance_loss_mlp": 1.1549263, + "diversity_loss_mlp": 0.0, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07192560701016996, + "language_loss": 0.9021467, + "learning_rate": 0.0008804962068778161, + "loss": 0.91383135, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1289, + "time_per_iteration": 2.8321304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_mlp": 1.20329499, + "diversity_loss_mlp": 0.0, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.08274381184261048, + "language_loss": 0.81234664, + "learning_rate": 0.0008802940165988511, + "loss": 0.82451665, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1290, + "time_per_iteration": 2.848726749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262968, + "balance_loss_mlp": 1.24875808, + "diversity_loss_mlp": 0.0, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.09449787402071168, + "language_loss": 0.88461435, + "learning_rate": 0.000880091678676655, + "loss": 0.8972441, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1291, + "time_per_iteration": 2.802199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279654, + "balance_loss_mlp": 1.26553965, + "diversity_loss_mlp": 0.0, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.11843407890200246, + "language_loss": 0.88870949, + "learning_rate": 0.0008798891931897821, + "loss": 0.90150601, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1292, + "time_per_iteration": 2.7150259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870403, + "balance_loss_mlp": 1.50883341, + "diversity_loss_mlp": 0.20002533, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.035309457370921726, + "language_loss": 0.84031773, + "learning_rate": 0.0008796865602168447, + "loss": 0.84902173, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597392, + "step": 1293, + "time_per_iteration": 2.5952000617980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210957, + "balance_loss_mlp": 1.19661582, + "diversity_loss_mlp": 0.0, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.07909897749306223, + "language_loss": 0.88611919, + "learning_rate": 0.0008794837798365115, + "loss": 0.89822876, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1294, + "time_per_iteration": 2.6257524490356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167929, + "balance_loss_mlp": 1.15246725, + "diversity_loss_mlp": 0.0, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.06704316740686254, + "language_loss": 0.8866623, + "learning_rate": 0.0008792808521275089, + "loss": 0.89834166, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1295, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153488, + "balance_loss_mlp": 1.13757372, + "diversity_loss_mlp": 0.0, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.08601952378824393, + "language_loss": 0.87496305, + "learning_rate": 0.0008790777771686206, + "loss": 0.88649786, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1296, + "time_per_iteration": 2.6131319999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124706, + "balance_loss_mlp": 1.10882747, + "diversity_loss_mlp": 0.0, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.0951042007575699, + "language_loss": 0.8543523, + "learning_rate": 0.0008788745550386872, + "loss": 0.86559939, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.15869141, + "routerloss_mlp": 0.0, + "step": 1297, + "time_per_iteration": 2.5590503215789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115503, + "balance_loss_mlp": 1.09948111, + "diversity_loss_mlp": 0.0, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.07219065567928346, + "language_loss": 0.80291975, + "learning_rate": 0.0008786711858166063, + "loss": 0.81407487, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1298, + "time_per_iteration": 2.951768398284912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00871436, + "balance_loss_mlp": 1.51113367, + "diversity_loss_mlp": 0.19870289, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.03357842357877673, + "language_loss": 0.83488023, + "learning_rate": 0.0008784676695813332, + "loss": 0.84359455, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165179, + "step": 1299, + "time_per_iteration": 2.985684871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108985, + "balance_loss_mlp": 1.07411456, + "diversity_loss_mlp": 0.0, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.07050099983107566, + "language_loss": 0.84900999, + "learning_rate": 0.0008782640064118796, + "loss": 0.85990846, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1300, + "time_per_iteration": 2.943368673324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139161, + "balance_loss_mlp": 1.13172245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.062054541004710057, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323914, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 1301, + "time_per_iteration": 4.975619316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106013, + "balance_loss_mlp": 1.09055138, + "diversity_loss_mlp": 0.0, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.08145949094764637, + "language_loss": 0.86554521, + "learning_rate": 0.0008778562395867648, + "loss": 0.87660533, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.15441895, + "routerloss_mlp": 0.0, + "step": 1302, + "time_per_iteration": 2.6318612098693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111342, + "balance_loss_mlp": 1.09572554, + "diversity_loss_mlp": 0.0, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.0727542370097133, + "language_loss": 0.84224409, + "learning_rate": 0.0008776521360894127, + "loss": 0.85335743, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.15600586, + "routerloss_mlp": 0.0, + "step": 1303, + "time_per_iteration": 2.6512627601623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_mlp": 1.02259421, + "diversity_loss_mlp": 0.0, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.02979233866947858, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79991817, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.07128906, + "routerloss_mlp": 0.0, + "step": 1304, + "time_per_iteration": 4.802467107772827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112993, + "balance_loss_mlp": 1.11518431, + "diversity_loss_mlp": 0.0, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.07060498048015267, + "language_loss": 0.9057076, + "learning_rate": 0.0008772434893213186, + "loss": 0.91700697, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1305, + "time_per_iteration": 2.601546049118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137218, + "balance_loss_mlp": 1.12251997, + "diversity_loss_mlp": 0.0, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.13797279723809866, + "language_loss": 0.84362888, + "learning_rate": 0.0008770389462092276, + "loss": 0.85500103, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.14685059, + "routerloss_mlp": 0.0, + "step": 1306, + "time_per_iteration": 2.626138210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141522, + "balance_loss_mlp": 1.12685966, + "diversity_loss_mlp": 0.0, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08471108342240245, + "language_loss": 0.86803389, + "learning_rate": 0.0008768342567176357, + "loss": 0.87944913, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1307, + "time_per_iteration": 2.8074796199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114159, + "balance_loss_mlp": 1.12681937, + "diversity_loss_mlp": 0.0, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07263390393133992, + "language_loss": 0.90559924, + "learning_rate": 0.0008766294209260107, + "loss": 0.91701508, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1308, + "time_per_iteration": 2.670790910720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147465, + "balance_loss_mlp": 1.13312435, + "diversity_loss_mlp": 0.0, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.07764888634730133, + "language_loss": 0.91554916, + "learning_rate": 0.0008764244389138767, + "loss": 0.92702377, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1309, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147476, + "balance_loss_mlp": 1.13318276, + "diversity_loss_mlp": 0.0, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.09714227143719616, + "language_loss": 0.82980847, + "learning_rate": 0.000876219310760815, + "loss": 0.8412832, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1310, + "time_per_iteration": 2.8601791858673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146968, + "balance_loss_mlp": 1.13273418, + "diversity_loss_mlp": 0.0, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.09648806821544922, + "language_loss": 0.81436276, + "learning_rate": 0.0008760140365464631, + "loss": 0.82583249, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1311, + "time_per_iteration": 2.599353790283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870128, + "balance_loss_mlp": 1.50605726, + "diversity_loss_mlp": 0.20002663, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.03529693250820236, + "language_loss": 0.871418, + "learning_rate": 0.0008758086163505156, + "loss": 0.88011926, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170862, + "step": 1312, + "time_per_iteration": 2.6166832447052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163863, + "balance_loss_mlp": 1.14953399, + "diversity_loss_mlp": 0.0, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.07147814499844148, + "language_loss": 0.89267951, + "learning_rate": 0.0008756030502527239, + "loss": 0.90431809, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1313, + "time_per_iteration": 2.8452062606811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188075, + "balance_loss_mlp": 1.17377019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.09335955432973846, + "language_loss": 0.90298462, + "learning_rate": 0.0008753973383328954, + "loss": 0.91486537, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1314, + "time_per_iteration": 2.6988537311553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165459, + "balance_loss_mlp": 1.15108287, + "diversity_loss_mlp": 0.0, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08872096542459323, + "language_loss": 0.83944553, + "learning_rate": 0.0008751914806708952, + "loss": 0.85110015, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1315, + "time_per_iteration": 2.6328680515289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151493, + "balance_loss_mlp": 1.1372478, + "diversity_loss_mlp": 0.0, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.09247066962171595, + "language_loss": 0.81854099, + "learning_rate": 0.0008749854773466439, + "loss": 0.83005595, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1316, + "time_per_iteration": 2.6708498001098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134446, + "balance_loss_mlp": 1.11980653, + "diversity_loss_mlp": 0.0, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.06992463478304738, + "language_loss": 0.84568423, + "learning_rate": 0.0008747793284401192, + "loss": 0.85702872, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1317, + "time_per_iteration": 2.70182204246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120804, + "balance_loss_mlp": 1.10560477, + "diversity_loss_mlp": 0.0, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.11229953955213261, + "language_loss": 0.85994983, + "learning_rate": 0.0008745730340313551, + "loss": 0.87115788, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1318, + "time_per_iteration": 2.8026556968688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.1048007, + "diversity_loss_mlp": 0.0, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.0843917818222923, + "language_loss": 0.84519732, + "learning_rate": 0.0008743665942004422, + "loss": 0.85639453, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.14904785, + "routerloss_mlp": 0.0, + "step": 1319, + "time_per_iteration": 2.6717073917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120645, + "balance_loss_mlp": 1.10569644, + "diversity_loss_mlp": 0.0, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.06860607652829093, + "language_loss": 0.92769039, + "learning_rate": 0.0008741600090275277, + "loss": 0.93889689, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1320, + "time_per_iteration": 2.6251981258392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120587, + "balance_loss_mlp": 1.10530448, + "diversity_loss_mlp": 0.0, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.09643257369734548, + "language_loss": 0.8425917, + "learning_rate": 0.0008739532785928151, + "loss": 0.85379755, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1321, + "time_per_iteration": 3.4925267696380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101061, + "balance_loss_mlp": 1.09305024, + "diversity_loss_mlp": 0.0, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.04547815076873398, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75994641, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1322, + "time_per_iteration": 4.8446879386901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0085354, + "balance_loss_mlp": 1.4814328, + "diversity_loss_mlp": 0.19370571, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.036800523279172735, + "language_loss": 0.82844102, + "learning_rate": 0.0008735393822590908, + "loss": 0.83697641, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597124, + "step": 1323, + "time_per_iteration": 2.7354650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174586, + "balance_loss_mlp": 1.16032863, + "diversity_loss_mlp": 0.0, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08280852347492981, + "language_loss": 0.87442601, + "learning_rate": 0.0008733322165207681, + "loss": 0.88617194, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1324, + "time_per_iteration": 2.6581695079803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120021, + "balance_loss_mlp": 1.18529749, + "diversity_loss_mlp": 0.0, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.0779912319299164, + "language_loss": 0.8296451, + "learning_rate": 0.0008731249058420247, + "loss": 0.84164721, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1325, + "time_per_iteration": 3.0674960613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203892, + "balance_loss_mlp": 1.18865728, + "diversity_loss_mlp": 0.0, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.10695670124077197, + "language_loss": 0.90080667, + "learning_rate": 0.0008729174503033459, + "loss": 0.91284555, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1326, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188403, + "balance_loss_mlp": 1.17334652, + "diversity_loss_mlp": 0.0, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.10125548093505272, + "language_loss": 0.82427752, + "learning_rate": 0.0008727098499852728, + "loss": 0.83616149, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1327, + "time_per_iteration": 2.833803415298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150318, + "balance_loss_mlp": 1.13529778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.08478455973869617, + "language_loss": 0.89778203, + "learning_rate": 0.0008725021049684034, + "loss": 0.90928519, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.15002441, + "routerloss_mlp": 0.0, + "step": 1328, + "time_per_iteration": 2.7405433654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116795, + "balance_loss_mlp": 1.10194123, + "diversity_loss_mlp": 0.0, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.07099770943741918, + "language_loss": 0.83078361, + "learning_rate": 0.000872294215333391, + "loss": 0.84195161, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1329, + "time_per_iteration": 3.219834089279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099158, + "balance_loss_mlp": 1.08430433, + "diversity_loss_mlp": 0.0, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.06913408205057751, + "language_loss": 0.82662833, + "learning_rate": 0.0008720861811609457, + "loss": 0.8376199, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1330, + "time_per_iteration": 2.753122329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096587, + "balance_loss_mlp": 1.0816741, + "diversity_loss_mlp": 0.0, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.0919113566921475, + "language_loss": 0.83719599, + "learning_rate": 0.0008718780025318338, + "loss": 0.84816188, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1331, + "time_per_iteration": 2.724808692932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.09296656, + "diversity_loss_mlp": 0.0, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.09880415123515712, + "language_loss": 0.83982158, + "learning_rate": 0.0008716696795268771, + "loss": 0.85089689, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1332, + "time_per_iteration": 2.718421220779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098797, + "balance_loss_mlp": 1.08430111, + "diversity_loss_mlp": 0.0, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.15208681676824193, + "language_loss": 0.85333431, + "learning_rate": 0.0008714612122269538, + "loss": 0.8643223, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1333, + "time_per_iteration": 2.877823829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120258, + "balance_loss_mlp": 1.10586989, + "diversity_loss_mlp": 0.0, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.07756137703605612, + "language_loss": 0.89334106, + "learning_rate": 0.0008712526007129982, + "loss": 0.90454364, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1334, + "time_per_iteration": 2.561842441558838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155014, + "balance_loss_mlp": 1.14101923, + "diversity_loss_mlp": 0.0, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.12724628219842446, + "language_loss": 0.90676123, + "learning_rate": 0.0008710438450660003, + "loss": 0.91831136, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1335, + "time_per_iteration": 2.6618270874023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199277, + "balance_loss_mlp": 1.18486404, + "diversity_loss_mlp": 0.0, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.10895723532104484, + "language_loss": 0.87596953, + "learning_rate": 0.0008708349453670064, + "loss": 0.88796222, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1336, + "time_per_iteration": 2.5121865272521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195197, + "balance_loss_mlp": 1.18032002, + "diversity_loss_mlp": 0.0, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.10227195785495524, + "language_loss": 0.91035736, + "learning_rate": 0.0008706259016971185, + "loss": 0.92230934, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.14855957, + "routerloss_mlp": 0.0, + "step": 1337, + "time_per_iteration": 2.7760090827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189061, + "balance_loss_mlp": 1.17414773, + "diversity_loss_mlp": 0.0, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.12625436277937716, + "language_loss": 0.83095431, + "learning_rate": 0.0008704167141374944, + "loss": 0.84284496, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1338, + "time_per_iteration": 2.824122428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146224, + "balance_loss_mlp": 1.13107228, + "diversity_loss_mlp": 0.0, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.0801465901926633, + "language_loss": 0.88427222, + "learning_rate": 0.0008702073827693482, + "loss": 0.89573455, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.15148926, + "routerloss_mlp": 0.0, + "step": 1339, + "time_per_iteration": 2.708488941192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101183, + "balance_loss_mlp": 1.0865202, + "diversity_loss_mlp": 0.0, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07445900988257396, + "language_loss": 0.88514435, + "learning_rate": 0.0008699979076739494, + "loss": 0.89615613, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1340, + "time_per_iteration": 2.960650682449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07054412, + "diversity_loss_mlp": 0.0, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.09041758143252471, + "language_loss": 0.88622832, + "learning_rate": 0.0008697882889326234, + "loss": 0.89708054, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1341, + "time_per_iteration": 2.5199689865112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094608, + "balance_loss_mlp": 1.08043432, + "diversity_loss_mlp": 0.0, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.08157938691300957, + "language_loss": 0.86840844, + "learning_rate": 0.0008695785266267515, + "loss": 0.87935448, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1342, + "time_per_iteration": 2.6833419799804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0089859, + "balance_loss_mlp": 1.56664371, + "diversity_loss_mlp": 0.19803861, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.03344075262961686, + "language_loss": 0.83491886, + "learning_rate": 0.0008693686208377704, + "loss": 0.84390479, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01624843, + "step": 1343, + "time_per_iteration": 2.8157622814178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101399, + "balance_loss_mlp": 1.08711743, + "diversity_loss_mlp": 0.0, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07460013341605923, + "language_loss": 0.89022982, + "learning_rate": 0.0008691585716471733, + "loss": 0.90124375, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1344, + "time_per_iteration": 2.6386232376098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.09707415, + "diversity_loss_mlp": 0.0, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.08548738123283665, + "language_loss": 0.85822487, + "learning_rate": 0.0008689483791365079, + "loss": 0.86934054, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1345, + "time_per_iteration": 2.831817626953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112096, + "balance_loss_mlp": 1.10685778, + "diversity_loss_mlp": 0.0, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07218857890204664, + "language_loss": 0.89327282, + "learning_rate": 0.0008687380433873786, + "loss": 0.90448248, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1346, + "time_per_iteration": 2.8322408199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139286, + "balance_loss_mlp": 1.1251955, + "diversity_loss_mlp": 0.0, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.07612070672802876, + "language_loss": 0.82638776, + "learning_rate": 0.0008685275644814448, + "loss": 0.83778065, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.14099121, + "routerloss_mlp": 0.0, + "step": 1347, + "time_per_iteration": 2.689772367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116224, + "balance_loss_mlp": 1.14764857, + "diversity_loss_mlp": 0.0, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07884944678342334, + "language_loss": 0.84390515, + "learning_rate": 0.0008683169425004216, + "loss": 0.85552752, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1348, + "time_per_iteration": 2.895153760910034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159732, + "balance_loss_mlp": 1.14511704, + "diversity_loss_mlp": 0.0, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.10354145261803285, + "language_loss": 0.83314335, + "learning_rate": 0.0008681061775260799, + "loss": 0.84474063, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1349, + "time_per_iteration": 2.850862503051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166024, + "balance_loss_mlp": 1.15118265, + "diversity_loss_mlp": 0.0, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.08416928552821445, + "language_loss": 0.9214983, + "learning_rate": 0.0008678952696402458, + "loss": 0.93315852, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1350, + "time_per_iteration": 2.525019884109497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153353, + "balance_loss_mlp": 1.13848734, + "diversity_loss_mlp": 0.0, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07397225666721696, + "language_loss": 0.86554277, + "learning_rate": 0.000867684218924801, + "loss": 0.87707639, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.1484375, + "routerloss_mlp": 0.0, + "step": 1351, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083238, + "balance_loss_mlp": 1.07517958, + "diversity_loss_mlp": 0.0, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.0438698963901256, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80030328, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1352, + "time_per_iteration": 4.916059255599976 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132931, + "balance_loss_mlp": 1.11807716, + "diversity_loss_mlp": 0.0, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06358739416567256, + "language_loss": 0.85154414, + "learning_rate": 0.0008672616893328834, + "loss": 0.86287344, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.14831543, + "routerloss_mlp": 0.0, + "step": 1353, + "time_per_iteration": 2.9301464557647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120208, + "balance_loss_mlp": 1.10545015, + "diversity_loss_mlp": 0.0, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.0804298790611747, + "language_loss": 0.89736795, + "learning_rate": 0.0008670502106204512, + "loss": 0.90857005, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.14733887, + "routerloss_mlp": 0.0, + "step": 1354, + "time_per_iteration": 2.8392651081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121529, + "balance_loss_mlp": 1.10672283, + "diversity_loss_mlp": 0.0, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.08121830869095954, + "language_loss": 0.81676221, + "learning_rate": 0.0008668385894064892, + "loss": 0.82797754, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1355, + "time_per_iteration": 2.632744550704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115185, + "balance_loss_mlp": 1.10095191, + "diversity_loss_mlp": 0.0, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.0871855710564252, + "language_loss": 0.88984954, + "learning_rate": 0.0008666268257731562, + "loss": 0.90100139, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1356, + "time_per_iteration": 3.0961363315582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132093, + "balance_loss_mlp": 1.11785948, + "diversity_loss_mlp": 0.0, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.08548634624367135, + "language_loss": 0.8594982, + "learning_rate": 0.0008664149198026662, + "loss": 0.87081909, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1357, + "time_per_iteration": 3.2423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133945, + "balance_loss_mlp": 1.12039137, + "diversity_loss_mlp": 0.0, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.09109654485188295, + "language_loss": 0.88802171, + "learning_rate": 0.0008662028715772883, + "loss": 0.89936113, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1358, + "time_per_iteration": 2.619495153427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138578, + "balance_loss_mlp": 1.12476182, + "diversity_loss_mlp": 0.0, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.07135790209188476, + "language_loss": 0.85816395, + "learning_rate": 0.0008659906811793467, + "loss": 0.86954975, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.13842773, + "routerloss_mlp": 0.0, + "step": 1359, + "time_per_iteration": 2.6752817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135637, + "balance_loss_mlp": 1.12191582, + "diversity_loss_mlp": 0.0, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.07783428421444573, + "language_loss": 0.89649427, + "learning_rate": 0.0008657783486912215, + "loss": 0.90785068, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1360, + "time_per_iteration": 2.770136594772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918859, + "balance_loss_mlp": 1.60386825, + "diversity_loss_mlp": 0.20058532, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.03438194549161764, + "language_loss": 0.90315008, + "learning_rate": 0.0008655658741953472, + "loss": 0.91233867, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01663268, + "step": 1361, + "time_per_iteration": 3.239567518234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117406, + "balance_loss_mlp": 1.10352993, + "diversity_loss_mlp": 0.0, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.053733033776962646, + "language_loss": 0.88311911, + "learning_rate": 0.0008653532577742136, + "loss": 0.89429319, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1362, + "time_per_iteration": 2.6912107467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111717, + "balance_loss_mlp": 1.09805584, + "diversity_loss_mlp": 0.0, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07456283347469675, + "language_loss": 0.8687824, + "learning_rate": 0.0008651404995103659, + "loss": 0.87989956, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1363, + "time_per_iteration": 2.5554919242858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106371, + "balance_loss_mlp": 1.09212554, + "diversity_loss_mlp": 0.0, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.0735216597505126, + "language_loss": 0.87311852, + "learning_rate": 0.0008649275994864041, + "loss": 0.88418221, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1364, + "time_per_iteration": 2.7228429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109936, + "balance_loss_mlp": 1.0955832, + "diversity_loss_mlp": 0.0, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06423000395680191, + "language_loss": 0.83767593, + "learning_rate": 0.0008647145577849834, + "loss": 0.84877527, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1365, + "time_per_iteration": 2.8194234371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110395, + "balance_loss_mlp": 1.09573257, + "diversity_loss_mlp": 0.0, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0636918785190987, + "language_loss": 0.82912111, + "learning_rate": 0.0008645013744888139, + "loss": 0.8402251, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1366, + "time_per_iteration": 2.9121909141540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106528, + "balance_loss_mlp": 1.09266424, + "diversity_loss_mlp": 0.0, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.07268525177684865, + "language_loss": 0.87255573, + "learning_rate": 0.0008642880496806607, + "loss": 0.88362104, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1367, + "time_per_iteration": 2.7527663707733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117256, + "balance_loss_mlp": 1.1027844, + "diversity_loss_mlp": 0.0, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.06883104565378229, + "language_loss": 0.84193766, + "learning_rate": 0.0008640745834433437, + "loss": 0.85311019, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1368, + "time_per_iteration": 2.7203800678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114698, + "balance_loss_mlp": 1.10065532, + "diversity_loss_mlp": 0.0, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.0718323039568536, + "language_loss": 0.87083656, + "learning_rate": 0.000863860975859738, + "loss": 0.88198352, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1369, + "time_per_iteration": 2.9021553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116648, + "balance_loss_mlp": 1.10278392, + "diversity_loss_mlp": 0.0, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08463505288724613, + "language_loss": 0.88568735, + "learning_rate": 0.0008636472270127733, + "loss": 0.8968538, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1370, + "time_per_iteration": 2.6336748600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118916, + "balance_loss_mlp": 1.10440779, + "diversity_loss_mlp": 0.0, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.08505114845208346, + "language_loss": 0.90530956, + "learning_rate": 0.0008634333369854345, + "loss": 0.91649872, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1371, + "time_per_iteration": 2.585775136947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122621, + "balance_loss_mlp": 1.10868549, + "diversity_loss_mlp": 0.0, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.07138701063901956, + "language_loss": 0.87574148, + "learning_rate": 0.0008632193058607608, + "loss": 0.88696772, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.13952637, + "routerloss_mlp": 0.0, + "step": 1372, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124687, + "balance_loss_mlp": 1.11042953, + "diversity_loss_mlp": 0.0, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.09395332240398839, + "language_loss": 0.81125695, + "learning_rate": 0.0008630051337218466, + "loss": 0.82250381, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1373, + "time_per_iteration": 2.6700031757354736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118707, + "balance_loss_mlp": 1.10506988, + "diversity_loss_mlp": 0.0, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.0808240378873911, + "language_loss": 0.82403839, + "learning_rate": 0.0008627908206518409, + "loss": 0.83522546, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1374, + "time_per_iteration": 2.6610107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.04099598647265769, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76212597, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 1375, + "time_per_iteration": 4.979893922805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109458, + "balance_loss_mlp": 1.09580863, + "diversity_loss_mlp": 0.0, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06989177478220372, + "language_loss": 0.91488004, + "learning_rate": 0.0008623617720514241, + "loss": 0.92597461, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1376, + "time_per_iteration": 2.6515755653381348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109452, + "balance_loss_mlp": 1.09554029, + "diversity_loss_mlp": 0.0, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.07399727326907257, + "language_loss": 0.84706682, + "learning_rate": 0.0008621470366875848, + "loss": 0.85816133, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1377, + "time_per_iteration": 2.599776268005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119233, + "balance_loss_mlp": 1.10546422, + "diversity_loss_mlp": 0.0, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07769258092785128, + "language_loss": 0.87980253, + "learning_rate": 0.0008619321607257966, + "loss": 0.89099485, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1378, + "time_per_iteration": 2.678865671157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.10274947, + "diversity_loss_mlp": 0.0, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07519514659764338, + "language_loss": 0.82002568, + "learning_rate": 0.000861717144249482, + "loss": 0.83118635, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1379, + "time_per_iteration": 2.8830740451812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_mlp": 1.10515702, + "diversity_loss_mlp": 0.0, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06542821866252439, + "language_loss": 0.89670694, + "learning_rate": 0.0008615019873421175, + "loss": 0.90789306, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.1348877, + "routerloss_mlp": 0.0, + "step": 1380, + "time_per_iteration": 2.4692320823669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124803, + "balance_loss_mlp": 1.11096311, + "diversity_loss_mlp": 0.0, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.08230289019981965, + "language_loss": 0.85984069, + "learning_rate": 0.0008612866900872349, + "loss": 0.87108874, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1381, + "time_per_iteration": 2.5671193599700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119212, + "balance_loss_mlp": 1.10564578, + "diversity_loss_mlp": 0.0, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.09708901974799254, + "language_loss": 0.8800329, + "learning_rate": 0.0008610712525684197, + "loss": 0.89122504, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1382, + "time_per_iteration": 2.673672676086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134856, + "balance_loss_mlp": 1.12075388, + "diversity_loss_mlp": 0.0, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.08550137436350284, + "language_loss": 0.84231853, + "learning_rate": 0.0008608556748693121, + "loss": 0.85366714, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1383, + "time_per_iteration": 3.285391330718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113293, + "balance_loss_mlp": 1.11881518, + "diversity_loss_mlp": 0.0, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.07276264363306281, + "language_loss": 0.86098409, + "learning_rate": 0.000860639957073607, + "loss": 0.87231338, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.14123535, + "routerloss_mlp": 0.0, + "step": 1384, + "time_per_iteration": 2.74979829788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130834, + "balance_loss_mlp": 1.11668396, + "diversity_loss_mlp": 0.0, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07735164598050102, + "language_loss": 0.87488532, + "learning_rate": 0.0008604240992650534, + "loss": 0.88619369, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1385, + "time_per_iteration": 2.765714406967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113264, + "balance_loss_mlp": 1.11819148, + "diversity_loss_mlp": 0.0, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.09224305204204497, + "language_loss": 0.89344275, + "learning_rate": 0.0008602081015274545, + "loss": 0.90476912, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.14428711, + "routerloss_mlp": 0.0, + "step": 1386, + "time_per_iteration": 2.7466471195220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130382, + "balance_loss_mlp": 1.11580229, + "diversity_loss_mlp": 0.0, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.08049268911379595, + "language_loss": 0.83551365, + "learning_rate": 0.0008599919639446684, + "loss": 0.84681749, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1387, + "time_per_iteration": 2.680053234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119435, + "balance_loss_mlp": 1.10439074, + "diversity_loss_mlp": 0.0, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08313146027802099, + "language_loss": 0.80363739, + "learning_rate": 0.000859775686600607, + "loss": 0.81483173, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.15026855, + "routerloss_mlp": 0.0, + "step": 1388, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114186, + "balance_loss_mlp": 1.12722135, + "diversity_loss_mlp": 0.0, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.08559032433145165, + "language_loss": 0.85052109, + "learning_rate": 0.0008595592695792367, + "loss": 0.86193967, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1389, + "time_per_iteration": 2.660012722015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112772, + "balance_loss_mlp": 1.11312914, + "diversity_loss_mlp": 0.0, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.07620364037172102, + "language_loss": 0.90774226, + "learning_rate": 0.0008593427129645778, + "loss": 0.91901946, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1390, + "time_per_iteration": 2.62744140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131127, + "balance_loss_mlp": 1.11615419, + "diversity_loss_mlp": 0.0, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.0742307152228864, + "language_loss": 0.85619152, + "learning_rate": 0.0008591260168407052, + "loss": 0.86750275, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1391, + "time_per_iteration": 2.738680124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_mlp": 1.09930313, + "diversity_loss_mlp": 0.0, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.05574398067767488, + "language_loss": 0.82839364, + "learning_rate": 0.0008589091812917479, + "loss": 0.83953172, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1392, + "time_per_iteration": 2.5947506427764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109245, + "balance_loss_mlp": 1.09471345, + "diversity_loss_mlp": 0.0, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07022348692687568, + "language_loss": 0.85257161, + "learning_rate": 0.0008586922064018887, + "loss": 0.86366403, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1393, + "time_per_iteration": 2.6624581813812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110195, + "balance_loss_mlp": 1.09542501, + "diversity_loss_mlp": 0.0, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.07561979453055602, + "language_loss": 0.89401793, + "learning_rate": 0.0008584750922553651, + "loss": 0.9051199, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1394, + "time_per_iteration": 3.1940202713012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107917, + "balance_loss_mlp": 1.0934931, + "diversity_loss_mlp": 0.0, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.07234350422575066, + "language_loss": 0.83740592, + "learning_rate": 0.0008582578389364677, + "loss": 0.84848505, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.14404297, + "routerloss_mlp": 0.0, + "step": 1395, + "time_per_iteration": 2.8844621181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106129, + "balance_loss_mlp": 1.09147811, + "diversity_loss_mlp": 0.0, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.061968206774760184, + "language_loss": 0.91908813, + "learning_rate": 0.0008580404465295422, + "loss": 0.93014938, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.14648438, + "routerloss_mlp": 0.0, + "step": 1396, + "time_per_iteration": 2.7842769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106127, + "balance_loss_mlp": 1.09155917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.07293181793333794, + "language_loss": 0.88274646, + "learning_rate": 0.0008578229151189876, + "loss": 0.89380777, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1397, + "time_per_iteration": 2.96771502494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110096, + "balance_loss_mlp": 1.08638036, + "diversity_loss_mlp": 0.0, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.08798004746081324, + "language_loss": 0.81253606, + "learning_rate": 0.0008576052447892573, + "loss": 0.82354569, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1398, + "time_per_iteration": 2.5413830280303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101823, + "balance_loss_mlp": 1.08761334, + "diversity_loss_mlp": 0.0, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.0737959226904994, + "language_loss": 0.86320835, + "learning_rate": 0.000857387435624858, + "loss": 0.87422657, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1399, + "time_per_iteration": 2.554016351699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00934821, + "balance_loss_mlp": 1.63627267, + "diversity_loss_mlp": 0.20064378, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.02492172823463741, + "language_loss": 0.88190895, + "learning_rate": 0.0008571694877103513, + "loss": 0.89125717, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636335, + "step": 1400, + "time_per_iteration": 3.307114839553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110386, + "balance_loss_mlp": 1.09591365, + "diversity_loss_mlp": 0.0, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.07757128819182789, + "language_loss": 0.87680864, + "learning_rate": 0.0008569514011303515, + "loss": 0.88791251, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1401, + "time_per_iteration": 2.800502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00917512, + "balance_loss_mlp": 1.60226941, + "diversity_loss_mlp": 0.19939175, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.03393521208879438, + "language_loss": 0.88186574, + "learning_rate": 0.0008567331759695277, + "loss": 0.8910408, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01668182, + "step": 1402, + "time_per_iteration": 2.7670016288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108043, + "balance_loss_mlp": 1.09297514, + "diversity_loss_mlp": 0.0, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.0674494366068644, + "language_loss": 0.86427194, + "learning_rate": 0.0008565148123126023, + "loss": 0.87535238, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1403, + "time_per_iteration": 2.660659074783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094781, + "balance_loss_mlp": 1.08053553, + "diversity_loss_mlp": 0.0, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.059221605294443855, + "language_loss": 0.86113608, + "learning_rate": 0.0008562963102443516, + "loss": 0.8720839, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1404, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110424, + "balance_loss_mlp": 1.090042, + "diversity_loss_mlp": 0.0, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.08483345099627004, + "language_loss": 0.85166299, + "learning_rate": 0.0008560776698496056, + "loss": 0.86270541, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.14196777, + "routerloss_mlp": 0.0, + "step": 1405, + "time_per_iteration": 2.9167518615722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110133, + "balance_loss_mlp": 1.09539831, + "diversity_loss_mlp": 0.0, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.06923600464578249, + "language_loss": 0.85861331, + "learning_rate": 0.0008558588912132481, + "loss": 0.86971468, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1406, + "time_per_iteration": 2.8346776962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00696474, + "balance_loss_mlp": 1.17983532, + "diversity_loss_mlp": 0.18206902, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.0036772550136199766, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77155459, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0155216, + "step": 1407, + "time_per_iteration": 4.943782091140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105422, + "balance_loss_mlp": 1.09137964, + "diversity_loss_mlp": 0.0, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.08329945876184135, + "language_loss": 0.82942384, + "learning_rate": 0.0008554209195555016, + "loss": 0.84047806, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1408, + "time_per_iteration": 2.7417516708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125368, + "balance_loss_mlp": 1.11146832, + "diversity_loss_mlp": 0.0, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.06975199960684045, + "language_loss": 0.8827157, + "learning_rate": 0.0008552017267041483, + "loss": 0.89396936, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1409, + "time_per_iteration": 2.6978721618652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126015, + "balance_loss_mlp": 1.11216331, + "diversity_loss_mlp": 0.0, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.06710824628929367, + "language_loss": 0.83395678, + "learning_rate": 0.0008549823959512549, + "loss": 0.84521693, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1410, + "time_per_iteration": 2.6867637634277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125714, + "balance_loss_mlp": 1.11246991, + "diversity_loss_mlp": 0.0, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07002470067050659, + "language_loss": 0.86486357, + "learning_rate": 0.0008547629273819728, + "loss": 0.87612069, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.13262939, + "routerloss_mlp": 0.0, + "step": 1411, + "time_per_iteration": 3.410454750061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142962, + "balance_loss_mlp": 1.12940812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07619635814943253, + "language_loss": 0.83522588, + "learning_rate": 0.0008545433210815074, + "loss": 0.84665549, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1412, + "time_per_iteration": 2.638172149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139051, + "balance_loss_mlp": 1.12536621, + "diversity_loss_mlp": 0.0, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.06317158203016926, + "language_loss": 0.87351668, + "learning_rate": 0.0008543235771351176, + "loss": 0.88490719, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1413, + "time_per_iteration": 2.7705581188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159735, + "balance_loss_mlp": 1.14645457, + "diversity_loss_mlp": 0.0, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.08259318688939964, + "language_loss": 0.84684592, + "learning_rate": 0.0008541036956281154, + "loss": 0.85844326, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1414, + "time_per_iteration": 2.8803579807281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147853, + "balance_loss_mlp": 1.13435841, + "diversity_loss_mlp": 0.0, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09396951476817994, + "language_loss": 0.81928164, + "learning_rate": 0.0008538836766458665, + "loss": 0.83076018, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.13519287, + "routerloss_mlp": 0.0, + "step": 1415, + "time_per_iteration": 2.860991954803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140979, + "balance_loss_mlp": 1.12721062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.07553622395064079, + "language_loss": 0.84927893, + "learning_rate": 0.0008536635202737897, + "loss": 0.86068869, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1416, + "time_per_iteration": 2.848196268081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146453, + "balance_loss_mlp": 1.13278019, + "diversity_loss_mlp": 0.0, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.07031625369418516, + "language_loss": 0.82188255, + "learning_rate": 0.0008534432265973573, + "loss": 0.83334708, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1417, + "time_per_iteration": 2.6029789447784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153419, + "balance_loss_mlp": 1.13950717, + "diversity_loss_mlp": 0.0, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.07823597875801033, + "language_loss": 0.88322413, + "learning_rate": 0.000853222795702095, + "loss": 0.89475828, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1418, + "time_per_iteration": 3.3933968544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.13570726, + "diversity_loss_mlp": 0.0, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.07267637680100167, + "language_loss": 0.83730674, + "learning_rate": 0.0008530022276735813, + "loss": 0.84880364, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1419, + "time_per_iteration": 2.766181707382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134499, + "balance_loss_mlp": 1.12086129, + "diversity_loss_mlp": 0.0, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.06887995103877555, + "language_loss": 0.86238861, + "learning_rate": 0.0008527815225974489, + "loss": 0.87373358, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1420, + "time_per_iteration": 2.6471102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135972, + "balance_loss_mlp": 1.12148833, + "diversity_loss_mlp": 0.0, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10131461494963417, + "language_loss": 0.88726115, + "learning_rate": 0.0008525606805593829, + "loss": 0.89862096, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1421, + "time_per_iteration": 2.436647653579712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118286, + "balance_loss_mlp": 1.10405266, + "diversity_loss_mlp": 0.0, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0859881194807961, + "language_loss": 0.8254106, + "learning_rate": 0.0008523397016451213, + "loss": 0.83659345, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1422, + "time_per_iteration": 2.593588352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103656, + "balance_loss_mlp": 1.08907628, + "diversity_loss_mlp": 0.0, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.06052148467578676, + "language_loss": 0.87038374, + "learning_rate": 0.0008521185859404564, + "loss": 0.88142037, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1423, + "time_per_iteration": 3.3936307430267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092129, + "balance_loss_mlp": 1.07775199, + "diversity_loss_mlp": 0.0, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.06977326166261295, + "language_loss": 0.8940134, + "learning_rate": 0.0008518973335312326, + "loss": 0.90493476, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1424, + "time_per_iteration": 2.7834270000457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081272, + "balance_loss_mlp": 1.06702638, + "diversity_loss_mlp": 0.0, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.119675165593639, + "language_loss": 0.83282709, + "learning_rate": 0.0008516759445033477, + "loss": 0.84363985, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1425, + "time_per_iteration": 2.665099859237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.06930685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.08266887436661914, + "language_loss": 0.85026807, + "learning_rate": 0.0008514544189427526, + "loss": 0.86110568, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.14477539, + "routerloss_mlp": 0.0, + "step": 1426, + "time_per_iteration": 2.6887404918670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086038, + "balance_loss_mlp": 1.07249546, + "diversity_loss_mlp": 0.0, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.06908859165293682, + "language_loss": 0.86575979, + "learning_rate": 0.0008512327569354511, + "loss": 0.87662017, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1427, + "time_per_iteration": 2.5235631465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_mlp": 1.09480238, + "diversity_loss_mlp": 0.0, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08987008099145026, + "language_loss": 0.8368206, + "learning_rate": 0.0008510109585675001, + "loss": 0.847902, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.13360596, + "routerloss_mlp": 0.0, + "step": 1428, + "time_per_iteration": 2.613348960876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140613, + "balance_loss_mlp": 1.13260245, + "diversity_loss_mlp": 0.0, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.05207498704371428, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82293957, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.08007812, + "routerloss_mlp": 0.0, + "step": 1429, + "time_per_iteration": 4.706013202667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133032, + "balance_loss_mlp": 1.11977601, + "diversity_loss_mlp": 0.0, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.09002666847623074, + "language_loss": 0.80503839, + "learning_rate": 0.0008505669530941415, + "loss": 0.8163687, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1430, + "time_per_iteration": 3.2976372241973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0097004, + "balance_loss_mlp": 1.70641518, + "diversity_loss_mlp": 0.20088202, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.03747760406507578, + "language_loss": 0.84294951, + "learning_rate": 0.000850344746161112, + "loss": 0.85264993, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01639144, + "step": 1431, + "time_per_iteration": 2.6297106742858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139922, + "balance_loss_mlp": 1.12685704, + "diversity_loss_mlp": 0.0, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.08230554095697513, + "language_loss": 0.87346137, + "learning_rate": 0.0008501224032121894, + "loss": 0.88486063, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1432, + "time_per_iteration": 2.4853787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129049, + "balance_loss_mlp": 1.1158998, + "diversity_loss_mlp": 0.0, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.06557126517551867, + "language_loss": 0.82118285, + "learning_rate": 0.0008498999243336946, + "loss": 0.83247334, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1433, + "time_per_iteration": 2.623809576034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11776567, + "diversity_loss_mlp": 0.0, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.0832335684907068, + "language_loss": 0.87471139, + "learning_rate": 0.0008496773096120021, + "loss": 0.88601708, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.12817383, + "routerloss_mlp": 0.0, + "step": 1434, + "time_per_iteration": 2.7995760440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111971, + "balance_loss_mlp": 1.10637057, + "diversity_loss_mlp": 0.0, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.10286197296711953, + "language_loss": 0.84387434, + "learning_rate": 0.0008494545591335381, + "loss": 0.85507143, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.13354492, + "routerloss_mlp": 0.0, + "step": 1435, + "time_per_iteration": 2.933576822280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113068, + "balance_loss_mlp": 1.09978795, + "diversity_loss_mlp": 0.0, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.053150449500146836, + "language_loss": 0.86971611, + "learning_rate": 0.0008492316729847823, + "loss": 0.88084674, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1436, + "time_per_iteration": 2.8865604400634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.09676659, + "diversity_loss_mlp": 0.0, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08937825724590943, + "language_loss": 0.7968539, + "learning_rate": 0.0008490086512522664, + "loss": 0.80795395, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1437, + "time_per_iteration": 2.7166872024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105369, + "balance_loss_mlp": 1.0916723, + "diversity_loss_mlp": 0.0, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.09013751301914075, + "language_loss": 0.90582836, + "learning_rate": 0.0008487854940225755, + "loss": 0.91688204, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1438, + "time_per_iteration": 2.4426465034484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102616, + "balance_loss_mlp": 1.08844161, + "diversity_loss_mlp": 0.0, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.09066429268698341, + "language_loss": 0.89896768, + "learning_rate": 0.0008485622013823466, + "loss": 0.90999383, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.14172363, + "routerloss_mlp": 0.0, + "step": 1439, + "time_per_iteration": 2.599177360534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07675576, + "diversity_loss_mlp": 0.0, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08059762035463526, + "language_loss": 0.83446515, + "learning_rate": 0.00084833877341827, + "loss": 0.84537244, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1440, + "time_per_iteration": 2.667215347290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.0762167, + "diversity_loss_mlp": 0.0, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.07889497077341047, + "language_loss": 0.80625433, + "learning_rate": 0.000848115210217088, + "loss": 0.81715715, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1441, + "time_per_iteration": 2.5463788509368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.08003855, + "diversity_loss_mlp": 0.0, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08443965058939805, + "language_loss": 0.81771946, + "learning_rate": 0.0008478915118655952, + "loss": 0.82866359, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1442, + "time_per_iteration": 2.743678569793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118232, + "balance_loss_mlp": 1.10385561, + "diversity_loss_mlp": 0.0, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07019455815968899, + "language_loss": 0.86195552, + "learning_rate": 0.0008476676784506393, + "loss": 0.87313789, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1443, + "time_per_iteration": 2.663422107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.10996866, + "diversity_loss_mlp": 0.0, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.08623331537045495, + "language_loss": 0.81889486, + "learning_rate": 0.0008474437100591201, + "loss": 0.83014178, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.14709473, + "routerloss_mlp": 0.0, + "step": 1444, + "time_per_iteration": 3.340557813644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129459, + "balance_loss_mlp": 1.11489129, + "diversity_loss_mlp": 0.0, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08279806566523454, + "language_loss": 0.85577607, + "learning_rate": 0.0008472196067779898, + "loss": 0.86707067, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1445, + "time_per_iteration": 2.675623655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112665, + "balance_loss_mlp": 1.09800267, + "diversity_loss_mlp": 0.0, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.10281028137483857, + "language_loss": 0.85108185, + "learning_rate": 0.0008469953686942531, + "loss": 0.86220849, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.14672852, + "routerloss_mlp": 0.0, + "step": 1446, + "time_per_iteration": 3.0647382736206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933894, + "balance_loss_mlp": 1.63962197, + "diversity_loss_mlp": 0.19544066, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.039122045531048345, + "language_loss": 0.83261281, + "learning_rate": 0.0008467709958949668, + "loss": 0.84195173, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01636306, + "step": 1447, + "time_per_iteration": 2.777806043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932176, + "balance_loss_mlp": 1.63710666, + "diversity_loss_mlp": 0.19454433, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.036668832644649825, + "language_loss": 0.85678959, + "learning_rate": 0.0008465464884672403, + "loss": 0.8661114, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01635053, + "step": 1448, + "time_per_iteration": 2.7313778400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109364, + "balance_loss_mlp": 1.07944214, + "diversity_loss_mlp": 0.0, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.08672786191572247, + "language_loss": 0.85892808, + "learning_rate": 0.0008463218464982348, + "loss": 0.86986446, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1449, + "time_per_iteration": 2.8115885257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109775, + "balance_loss_mlp": 1.08367157, + "diversity_loss_mlp": 0.0, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.09681901325388456, + "language_loss": 0.8756566, + "learning_rate": 0.0008460970700751645, + "loss": 0.88663405, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1450, + "time_per_iteration": 3.071645975112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.07963276, + "diversity_loss_mlp": 0.0, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.09020366192691211, + "language_loss": 0.87640095, + "learning_rate": 0.000845872159285295, + "loss": 0.88733411, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1451, + "time_per_iteration": 2.7342164516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051691, + "balance_loss_mlp": 1.04301238, + "diversity_loss_mlp": 0.0, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.032344288076380935, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78818536, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 1452, + "time_per_iteration": 4.95387077331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121492, + "balance_loss_mlp": 1.10795009, + "diversity_loss_mlp": 0.0, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.08097200979220782, + "language_loss": 0.86171871, + "learning_rate": 0.0008454219349544836, + "loss": 0.87293363, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1453, + "time_per_iteration": 3.373755693435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127619, + "balance_loss_mlp": 1.11439896, + "diversity_loss_mlp": 0.0, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.0882994281711823, + "language_loss": 0.81864405, + "learning_rate": 0.000845196621588334, + "loss": 0.82992017, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.13244629, + "routerloss_mlp": 0.0, + "step": 1454, + "time_per_iteration": 2.758122682571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147815, + "balance_loss_mlp": 1.13453507, + "diversity_loss_mlp": 0.0, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.06575509380885615, + "language_loss": 0.76256007, + "learning_rate": 0.0008449711742049706, + "loss": 0.7740382, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1455, + "time_per_iteration": 2.752345561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156513, + "balance_loss_mlp": 1.1432693, + "diversity_loss_mlp": 0.0, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.10411587441286801, + "language_loss": 0.84306383, + "learning_rate": 0.0008447455928919196, + "loss": 0.85462898, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1456, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146529, + "balance_loss_mlp": 1.13327312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.07273170046833245, + "language_loss": 0.86767292, + "learning_rate": 0.0008445198777367595, + "loss": 0.87913817, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1457, + "time_per_iteration": 2.614743947982788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144047, + "balance_loss_mlp": 1.13080251, + "diversity_loss_mlp": 0.0, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.08362811388708001, + "language_loss": 0.81054902, + "learning_rate": 0.0008442940288271208, + "loss": 0.82198954, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1458, + "time_per_iteration": 2.615705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112578, + "balance_loss_mlp": 1.11191583, + "diversity_loss_mlp": 0.0, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06892977395484212, + "language_loss": 0.8688817, + "learning_rate": 0.0008440680462506856, + "loss": 0.88013953, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1459, + "time_per_iteration": 2.810474157333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121233, + "balance_loss_mlp": 1.10828125, + "diversity_loss_mlp": 0.0, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.06441288224223744, + "language_loss": 0.86424565, + "learning_rate": 0.0008438419300951883, + "loss": 0.87545788, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1460, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115517, + "balance_loss_mlp": 1.10215354, + "diversity_loss_mlp": 0.0, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.12446768600100189, + "language_loss": 0.86647975, + "learning_rate": 0.0008436156804484148, + "loss": 0.87763494, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1461, + "time_per_iteration": 2.810589075088501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110833, + "balance_loss_mlp": 1.0965395, + "diversity_loss_mlp": 0.0, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.08490544085138897, + "language_loss": 0.88168794, + "learning_rate": 0.0008433892973982031, + "loss": 0.89279622, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1462, + "time_per_iteration": 2.561211347579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10098886, + "diversity_loss_mlp": 0.0, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07295818188475026, + "language_loss": 0.84776855, + "learning_rate": 0.0008431627810324431, + "loss": 0.85892212, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1463, + "time_per_iteration": 2.654146671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117739, + "balance_loss_mlp": 1.10345769, + "diversity_loss_mlp": 0.0, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06893619297503142, + "language_loss": 0.8126353, + "learning_rate": 0.000842936131439076, + "loss": 0.82381272, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.1427002, + "routerloss_mlp": 0.0, + "step": 1464, + "time_per_iteration": 2.6571760177612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115394, + "balance_loss_mlp": 1.1010766, + "diversity_loss_mlp": 0.0, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.07879840484237804, + "language_loss": 0.87885797, + "learning_rate": 0.0008427093487060951, + "loss": 0.89001191, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.14294434, + "routerloss_mlp": 0.0, + "step": 1465, + "time_per_iteration": 2.6847336292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101907, + "balance_loss_mlp": 1.08776927, + "diversity_loss_mlp": 0.0, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06118480673876746, + "language_loss": 0.84661305, + "learning_rate": 0.000842482432921545, + "loss": 0.8576321, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1466, + "time_per_iteration": 2.884965181350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110751, + "balance_loss_mlp": 1.09353852, + "diversity_loss_mlp": 0.0, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07927655906335743, + "language_loss": 0.87199128, + "learning_rate": 0.0008422553841735225, + "loss": 0.88306642, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.13977051, + "routerloss_mlp": 0.0, + "step": 1467, + "time_per_iteration": 2.528017997741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115631, + "balance_loss_mlp": 1.10146928, + "diversity_loss_mlp": 0.0, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07348722340160863, + "language_loss": 0.84837711, + "learning_rate": 0.0008420282025501757, + "loss": 0.85953343, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1468, + "time_per_iteration": 2.7696359157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115321, + "balance_loss_mlp": 1.10156429, + "diversity_loss_mlp": 0.0, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.07024793700711117, + "language_loss": 0.85080296, + "learning_rate": 0.0008418008881397043, + "loss": 0.86195612, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1469, + "time_per_iteration": 2.659646511077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115825, + "balance_loss_mlp": 1.10241413, + "diversity_loss_mlp": 0.0, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.12791916727658353, + "language_loss": 0.82420468, + "learning_rate": 0.0008415734410303595, + "loss": 0.83536291, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1470, + "time_per_iteration": 3.2350287437438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120259, + "balance_loss_mlp": 1.10672879, + "diversity_loss_mlp": 0.0, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.0700140113394834, + "language_loss": 0.90437436, + "learning_rate": 0.0008413458613104444, + "loss": 0.91557699, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1471, + "time_per_iteration": 2.7219245433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111254, + "balance_loss_mlp": 1.09766376, + "diversity_loss_mlp": 0.0, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.07145574186167022, + "language_loss": 0.83164495, + "learning_rate": 0.0008411181490683129, + "loss": 0.84275752, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1472, + "time_per_iteration": 2.727936029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107735, + "balance_loss_mlp": 1.09348917, + "diversity_loss_mlp": 0.0, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.0645149730480124, + "language_loss": 0.82377428, + "learning_rate": 0.0008408903043923707, + "loss": 0.83485162, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1473, + "time_per_iteration": 2.9972269535064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111455, + "balance_loss_mlp": 1.1004951, + "diversity_loss_mlp": 0.0, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09233547648167305, + "language_loss": 0.81268132, + "learning_rate": 0.0008406623273710754, + "loss": 0.82382679, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1474, + "time_per_iteration": 2.5923123359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105938, + "balance_loss_mlp": 1.09263408, + "diversity_loss_mlp": 0.0, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0761903935255829, + "language_loss": 0.8290056, + "learning_rate": 0.0008404342180929351, + "loss": 0.840065, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1475, + "time_per_iteration": 2.664698600769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121728, + "balance_loss_mlp": 1.10819817, + "diversity_loss_mlp": 0.0, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.08946081876366527, + "language_loss": 0.81824017, + "learning_rate": 0.00084020597664651, + "loss": 0.82945752, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1476, + "time_per_iteration": 2.7941510677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113829, + "balance_loss_mlp": 1.10019112, + "diversity_loss_mlp": 0.0, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09030679544521746, + "language_loss": 0.83820337, + "learning_rate": 0.0008399776031204111, + "loss": 0.84934169, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.13659668, + "routerloss_mlp": 0.0, + "step": 1477, + "time_per_iteration": 2.7508158683776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101529, + "balance_loss_mlp": 1.08784389, + "diversity_loss_mlp": 0.0, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.07642048536310797, + "language_loss": 0.79864645, + "learning_rate": 0.0008397490976033009, + "loss": 0.80966175, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1478, + "time_per_iteration": 2.6500625610351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_mlp": 1.04673624, + "diversity_loss_mlp": 0.0, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.0303646120618472, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78933775, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.07373047, + "routerloss_mlp": 0.0, + "step": 1479, + "time_per_iteration": 4.757360935211182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098606, + "balance_loss_mlp": 1.08449173, + "diversity_loss_mlp": 0.0, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.06570619267025138, + "language_loss": 0.85133117, + "learning_rate": 0.0008392916909509525, + "loss": 0.86231726, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1480, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093081, + "balance_loss_mlp": 1.07888281, + "diversity_loss_mlp": 0.0, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07896332999012158, + "language_loss": 0.8543641, + "learning_rate": 0.0008390627899932954, + "loss": 0.86529493, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.14208984, + "routerloss_mlp": 0.0, + "step": 1481, + "time_per_iteration": 2.5937705039978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100254, + "balance_loss_mlp": 1.08532953, + "diversity_loss_mlp": 0.0, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.08879627929694006, + "language_loss": 0.88894033, + "learning_rate": 0.000838833757399789, + "loss": 0.89994287, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1482, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106961, + "balance_loss_mlp": 1.09247661, + "diversity_loss_mlp": 0.0, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08557616325511565, + "language_loss": 0.80760586, + "learning_rate": 0.0008386045932593515, + "loss": 0.81867552, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1483, + "time_per_iteration": 2.6901025772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112082, + "balance_loss_mlp": 1.09776473, + "diversity_loss_mlp": 0.0, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.0661413109298982, + "language_loss": 0.86017227, + "learning_rate": 0.0008383752976609525, + "loss": 0.87129307, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1484, + "time_per_iteration": 2.9148330688476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116421, + "balance_loss_mlp": 1.1014719, + "diversity_loss_mlp": 0.0, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06788684976720215, + "language_loss": 0.80004096, + "learning_rate": 0.0008381458706936123, + "loss": 0.81120521, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.14916992, + "routerloss_mlp": 0.0, + "step": 1485, + "time_per_iteration": 2.681067943572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112387, + "balance_loss_mlp": 1.09728312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.06920905175587555, + "language_loss": 0.8725493, + "learning_rate": 0.0008379163124464025, + "loss": 0.88367319, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1486, + "time_per_iteration": 2.7093162536621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117865, + "balance_loss_mlp": 1.10290396, + "diversity_loss_mlp": 0.0, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.09647963836289664, + "language_loss": 0.77093983, + "learning_rate": 0.0008376866230084452, + "loss": 0.78211844, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.14941406, + "routerloss_mlp": 0.0, + "step": 1487, + "time_per_iteration": 2.8678433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00910546, + "balance_loss_mlp": 1.59136748, + "diversity_loss_mlp": 0.19592074, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.03660624024989628, + "language_loss": 0.86046171, + "learning_rate": 0.000837456802468914, + "loss": 0.86956716, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01690142, + "step": 1488, + "time_per_iteration": 2.602982997894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102391, + "balance_loss_mlp": 1.08787107, + "diversity_loss_mlp": 0.0, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.0820682475712047, + "language_loss": 0.85374725, + "learning_rate": 0.0008372268509170331, + "loss": 0.86477119, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1489, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099554, + "balance_loss_mlp": 1.08529639, + "diversity_loss_mlp": 0.0, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09305985964981825, + "language_loss": 0.85262501, + "learning_rate": 0.0008369967684420779, + "loss": 0.86362052, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1490, + "time_per_iteration": 2.7102949619293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.06912422, + "diversity_loss_mlp": 0.0, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.08804420397834639, + "language_loss": 0.84696782, + "learning_rate": 0.0008367665551333736, + "loss": 0.85779965, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1491, + "time_per_iteration": 2.618272304534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088636, + "balance_loss_mlp": 1.07430756, + "diversity_loss_mlp": 0.0, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.07991380194683065, + "language_loss": 0.85525382, + "learning_rate": 0.0008365362110802977, + "loss": 0.86614019, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1492, + "time_per_iteration": 2.851928234100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101019, + "balance_loss_mlp": 1.08655906, + "diversity_loss_mlp": 0.0, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.0838988471662801, + "language_loss": 0.82620168, + "learning_rate": 0.0008363057363722773, + "loss": 0.83721185, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1493, + "time_per_iteration": 2.853207588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.09245062, + "diversity_loss_mlp": 0.0, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.06826703692619526, + "language_loss": 0.84157109, + "learning_rate": 0.0008360751310987906, + "loss": 0.85263485, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1494, + "time_per_iteration": 2.57387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113071, + "balance_loss_mlp": 1.11695361, + "diversity_loss_mlp": 0.0, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.058749130100992836, + "language_loss": 0.85290074, + "learning_rate": 0.0008358443953493666, + "loss": 0.86420786, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1495, + "time_per_iteration": 2.8883073329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164777, + "balance_loss_mlp": 1.15067482, + "diversity_loss_mlp": 0.0, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.08087911977453179, + "language_loss": 0.88221979, + "learning_rate": 0.0008356135292135851, + "loss": 0.89386749, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1496, + "time_per_iteration": 2.5230934619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186431, + "balance_loss_mlp": 1.17226899, + "diversity_loss_mlp": 0.0, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.11116302526442519, + "language_loss": 0.92429602, + "learning_rate": 0.0008353825327810758, + "loss": 0.93616039, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1497, + "time_per_iteration": 2.420966863632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188369, + "balance_loss_mlp": 1.17465985, + "diversity_loss_mlp": 0.0, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.07094257684914687, + "language_loss": 0.8160103, + "learning_rate": 0.00083515140614152, + "loss": 0.82789397, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1498, + "time_per_iteration": 2.7105205059051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172297, + "balance_loss_mlp": 1.15901685, + "diversity_loss_mlp": 0.0, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.09212284213685974, + "language_loss": 0.87059236, + "learning_rate": 0.0008349201493846485, + "loss": 0.88231528, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1499, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148211, + "balance_loss_mlp": 1.13470435, + "diversity_loss_mlp": 0.0, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07375807574735407, + "language_loss": 0.88790113, + "learning_rate": 0.0008346887626002432, + "loss": 0.89938325, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1500, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919256, + "balance_loss_mlp": 1.60489607, + "diversity_loss_mlp": 0.19980004, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.030907333217789122, + "language_loss": 0.85892522, + "learning_rate": 0.000834457245878137, + "loss": 0.86811781, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0169074, + "step": 1501, + "time_per_iteration": 2.6543540954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112198, + "balance_loss_mlp": 1.10861671, + "diversity_loss_mlp": 0.0, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.09029230185558035, + "language_loss": 0.81450766, + "learning_rate": 0.000834225599308212, + "loss": 0.82572746, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1502, + "time_per_iteration": 3.2493886947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125349, + "balance_loss_mlp": 1.11191428, + "diversity_loss_mlp": 0.0, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.07343077704271528, + "language_loss": 0.85592055, + "learning_rate": 0.0008339938229804016, + "loss": 0.86717403, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.13458252, + "routerloss_mlp": 0.0, + "step": 1503, + "time_per_iteration": 2.712455987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091173, + "balance_loss_mlp": 1.08344853, + "diversity_loss_mlp": 0.0, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.040592353184382625, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76525998, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1504, + "time_per_iteration": 4.975377082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117311, + "balance_loss_mlp": 1.10320854, + "diversity_loss_mlp": 0.0, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.10665663300821891, + "language_loss": 0.84014988, + "learning_rate": 0.0008335298814111094, + "loss": 0.85132295, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1505, + "time_per_iteration": 2.563352584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119478, + "balance_loss_mlp": 1.10572124, + "diversity_loss_mlp": 0.0, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.07488877863745698, + "language_loss": 0.87982982, + "learning_rate": 0.0008332977163497455, + "loss": 0.89102459, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1506, + "time_per_iteration": 2.799177646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.10419846, + "diversity_loss_mlp": 0.0, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.08855239932012744, + "language_loss": 0.83522987, + "learning_rate": 0.0008330654218907325, + "loss": 0.84640789, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1507, + "time_per_iteration": 2.7311654090881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130891, + "balance_loss_mlp": 1.1170032, + "diversity_loss_mlp": 0.0, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06185767339129184, + "language_loss": 0.82011658, + "learning_rate": 0.0008328329981242548, + "loss": 0.83142549, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1508, + "time_per_iteration": 2.87014102935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148949, + "balance_loss_mlp": 1.13483465, + "diversity_loss_mlp": 0.0, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.0780337340178098, + "language_loss": 0.88045996, + "learning_rate": 0.0008326004451405475, + "loss": 0.89194947, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1509, + "time_per_iteration": 2.7449288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146827, + "balance_loss_mlp": 1.13290334, + "diversity_loss_mlp": 0.0, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07615169765943663, + "language_loss": 0.82328165, + "learning_rate": 0.0008323677630298957, + "loss": 0.83474988, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1510, + "time_per_iteration": 2.5527472496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00911058, + "balance_loss_mlp": 1.59209251, + "diversity_loss_mlp": 0.19929613, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.030084219280472915, + "language_loss": 0.84789264, + "learning_rate": 0.0008321349518826345, + "loss": 0.85700321, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01536426, + "step": 1511, + "time_per_iteration": 2.85006046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167449, + "balance_loss_mlp": 1.15337038, + "diversity_loss_mlp": 0.0, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.09547204503407083, + "language_loss": 0.94614309, + "learning_rate": 0.0008319020117891491, + "loss": 0.95781755, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1512, + "time_per_iteration": 2.619699001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150869, + "balance_loss_mlp": 1.13603973, + "diversity_loss_mlp": 0.0, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0903449194731753, + "language_loss": 0.86757064, + "learning_rate": 0.0008316689428398751, + "loss": 0.87907934, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.14819336, + "routerloss_mlp": 0.0, + "step": 1513, + "time_per_iteration": 2.6975061893463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122355, + "balance_loss_mlp": 1.10804975, + "diversity_loss_mlp": 0.0, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.05700485295001885, + "language_loss": 0.88661957, + "learning_rate": 0.0008314357451252979, + "loss": 0.89784312, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.14306641, + "routerloss_mlp": 0.0, + "step": 1514, + "time_per_iteration": 2.7759623527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101866, + "balance_loss_mlp": 1.08762062, + "diversity_loss_mlp": 0.0, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.06876651723291546, + "language_loss": 0.87979865, + "learning_rate": 0.0008312024187359527, + "loss": 0.89081734, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1515, + "time_per_iteration": 2.6594746112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108928, + "balance_loss_mlp": 1.07499838, + "diversity_loss_mlp": 0.0, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.06943657009436902, + "language_loss": 0.87168229, + "learning_rate": 0.000830968963762425, + "loss": 0.88257504, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.14282227, + "routerloss_mlp": 0.0, + "step": 1516, + "time_per_iteration": 3.0544168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078645, + "balance_loss_mlp": 1.06457818, + "diversity_loss_mlp": 0.0, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.07942748937188983, + "language_loss": 0.84183443, + "learning_rate": 0.0008307353802953497, + "loss": 0.85262084, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1517, + "time_per_iteration": 2.7325901985168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.06031072, + "diversity_loss_mlp": 0.0, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.0903207444065502, + "language_loss": 0.86203992, + "learning_rate": 0.0008305016684254125, + "loss": 0.87279052, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1518, + "time_per_iteration": 2.790580987930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_mlp": 1.05908012, + "diversity_loss_mlp": 0.0, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07640210633127195, + "language_loss": 0.86818451, + "learning_rate": 0.0008302678282433479, + "loss": 0.87892002, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1519, + "time_per_iteration": 2.594045400619507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077986, + "balance_loss_mlp": 1.06394291, + "diversity_loss_mlp": 0.0, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07607218771192015, + "language_loss": 0.84937745, + "learning_rate": 0.0008300338598399411, + "loss": 0.86015737, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1520, + "time_per_iteration": 2.6176183223724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00897129, + "balance_loss_mlp": 1.56367016, + "diversity_loss_mlp": 0.19839743, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.03454500929264816, + "language_loss": 0.94754219, + "learning_rate": 0.0008297997633060263, + "loss": 0.95651346, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0160955, + "step": 1521, + "time_per_iteration": 2.5507402420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098471, + "balance_loss_mlp": 1.08445215, + "diversity_loss_mlp": 0.0, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07923859397995789, + "language_loss": 0.84868819, + "learning_rate": 0.0008295655387324883, + "loss": 0.8596729, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1522, + "time_per_iteration": 2.942894458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103286, + "balance_loss_mlp": 1.08957708, + "diversity_loss_mlp": 0.0, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.09185291067452052, + "language_loss": 0.84979212, + "learning_rate": 0.0008293311862102609, + "loss": 0.86082506, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1523, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10218382, + "diversity_loss_mlp": 0.0, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07878242279946136, + "language_loss": 0.88546365, + "learning_rate": 0.0008290967058303275, + "loss": 0.89662319, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.13781738, + "routerloss_mlp": 0.0, + "step": 1524, + "time_per_iteration": 2.5723721981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117022, + "balance_loss_mlp": 1.10387325, + "diversity_loss_mlp": 0.0, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07157234250277994, + "language_loss": 0.86573815, + "learning_rate": 0.0008288620976837219, + "loss": 0.87690842, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1525, + "time_per_iteration": 2.539079427719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116802, + "balance_loss_mlp": 1.10354626, + "diversity_loss_mlp": 0.0, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07300174969402286, + "language_loss": 0.82548958, + "learning_rate": 0.000828627361861527, + "loss": 0.83665758, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1526, + "time_per_iteration": 2.5784413814544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117225, + "balance_loss_mlp": 1.10368335, + "diversity_loss_mlp": 0.0, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.105387273671708, + "language_loss": 0.84438479, + "learning_rate": 0.0008283924984548752, + "loss": 0.85555708, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1527, + "time_per_iteration": 2.876854181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136626, + "balance_loss_mlp": 1.12352467, + "diversity_loss_mlp": 0.0, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.07473419184062492, + "language_loss": 0.84776825, + "learning_rate": 0.0008281575075549485, + "loss": 0.8591345, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1528, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103997, + "balance_loss_mlp": 1.09631968, + "diversity_loss_mlp": 0.0, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.053938657910520806, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78456688, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.07666016, + "routerloss_mlp": 0.0, + "step": 1529, + "time_per_iteration": 4.633493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149647, + "balance_loss_mlp": 1.13666511, + "diversity_loss_mlp": 0.0, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.07225715112962865, + "language_loss": 0.90511358, + "learning_rate": 0.0008276871436402469, + "loss": 0.91661, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1530, + "time_per_iteration": 2.8149213790893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156897, + "balance_loss_mlp": 1.14402199, + "diversity_loss_mlp": 0.0, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.10076437192912456, + "language_loss": 0.87526608, + "learning_rate": 0.000827451770808083, + "loss": 0.88683504, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1531, + "time_per_iteration": 2.7307019233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137224, + "balance_loss_mlp": 1.12402749, + "diversity_loss_mlp": 0.0, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07118672956881426, + "language_loss": 0.8318634, + "learning_rate": 0.0008272162708478674, + "loss": 0.84323561, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1532, + "time_per_iteration": 2.559326648712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135091, + "balance_loss_mlp": 1.1222167, + "diversity_loss_mlp": 0.0, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.07324079883183283, + "language_loss": 0.86170006, + "learning_rate": 0.000826980643851029, + "loss": 0.87305093, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1533, + "time_per_iteration": 2.728351354598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120692, + "balance_loss_mlp": 1.10734081, + "diversity_loss_mlp": 0.0, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.07850912920042735, + "language_loss": 0.84523225, + "learning_rate": 0.0008267448899090464, + "loss": 0.85643911, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.13378906, + "routerloss_mlp": 0.0, + "step": 1534, + "time_per_iteration": 2.595296859741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121931, + "balance_loss_mlp": 1.10788798, + "diversity_loss_mlp": 0.0, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07265790711823701, + "language_loss": 0.80930066, + "learning_rate": 0.0008265090091134473, + "loss": 0.82051992, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1535, + "time_per_iteration": 2.8336315155029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105358, + "balance_loss_mlp": 1.09133863, + "diversity_loss_mlp": 0.0, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.08467148330579209, + "language_loss": 0.80271345, + "learning_rate": 0.0008262730015558088, + "loss": 0.81376696, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1536, + "time_per_iteration": 2.9066760540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102197, + "balance_loss_mlp": 1.08847594, + "diversity_loss_mlp": 0.0, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.07407642769484, + "language_loss": 0.81805962, + "learning_rate": 0.0008260368673277574, + "loss": 0.82908159, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.1373291, + "routerloss_mlp": 0.0, + "step": 1537, + "time_per_iteration": 3.1795482635498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106736, + "balance_loss_mlp": 1.09302735, + "diversity_loss_mlp": 0.0, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.06784415515848828, + "language_loss": 0.84026253, + "learning_rate": 0.0008258006065209682, + "loss": 0.85132986, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1538, + "time_per_iteration": 2.766732931137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112658, + "balance_loss_mlp": 1.09863889, + "diversity_loss_mlp": 0.0, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.0747520981493109, + "language_loss": 0.80543184, + "learning_rate": 0.0008255642192271657, + "loss": 0.81655836, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.14038086, + "routerloss_mlp": 0.0, + "step": 1539, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130833, + "balance_loss_mlp": 1.11683834, + "diversity_loss_mlp": 0.0, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.06277821647748005, + "language_loss": 0.83592129, + "learning_rate": 0.0008253277055381241, + "loss": 0.8472296, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1540, + "time_per_iteration": 2.8384311199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138407, + "balance_loss_mlp": 1.12428069, + "diversity_loss_mlp": 0.0, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.09924754491110549, + "language_loss": 0.85482454, + "learning_rate": 0.0008250910655456658, + "loss": 0.86620867, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1541, + "time_per_iteration": 3.1718008518218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.12016189, + "diversity_loss_mlp": 0.0, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.07747440640117766, + "language_loss": 0.83370835, + "learning_rate": 0.0008248542993416625, + "loss": 0.84504688, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1542, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127147, + "balance_loss_mlp": 1.11278272, + "diversity_loss_mlp": 0.0, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08018137719350796, + "language_loss": 0.83926904, + "learning_rate": 0.0008246174070180352, + "loss": 0.85054052, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.14355469, + "routerloss_mlp": 0.0, + "step": 1543, + "time_per_iteration": 2.6775217056274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.10168624, + "diversity_loss_mlp": 0.0, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09273281815149376, + "language_loss": 0.83928716, + "learning_rate": 0.0008243803886667537, + "loss": 0.85044312, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1544, + "time_per_iteration": 3.0925238132476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110422, + "balance_loss_mlp": 1.09024858, + "diversity_loss_mlp": 0.0, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.06593992881851045, + "language_loss": 0.79115343, + "learning_rate": 0.0008241432443798364, + "loss": 0.80219567, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1545, + "time_per_iteration": 2.839099407196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_mlp": 1.07518196, + "diversity_loss_mlp": 0.0, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.05453506209022983, + "language_loss": 0.85691601, + "learning_rate": 0.0008239059742493512, + "loss": 0.86780155, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1546, + "time_per_iteration": 2.7476751804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088565, + "balance_loss_mlp": 1.07480812, + "diversity_loss_mlp": 0.0, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.06672989003234615, + "language_loss": 0.87117672, + "learning_rate": 0.0008236685783674142, + "loss": 0.88206244, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1547, + "time_per_iteration": 3.0519776344299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06796312, + "diversity_loss_mlp": 0.0, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.04305360715769565, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.772995, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 1548, + "time_per_iteration": 4.883166790008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084727, + "balance_loss_mlp": 1.07123256, + "diversity_loss_mlp": 0.0, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.11160876507978217, + "language_loss": 0.82253683, + "learning_rate": 0.0008231934097178955, + "loss": 0.8333841, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.1350708, + "routerloss_mlp": 0.0, + "step": 1549, + "time_per_iteration": 2.60786771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092454, + "balance_loss_mlp": 1.07919788, + "diversity_loss_mlp": 0.0, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07843428838445873, + "language_loss": 0.85328496, + "learning_rate": 0.0008229556371347903, + "loss": 0.86420953, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1550, + "time_per_iteration": 2.962412118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106892, + "balance_loss_mlp": 1.09379029, + "diversity_loss_mlp": 0.0, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.0840525031564576, + "language_loss": 0.79399186, + "learning_rate": 0.0008227177391691874, + "loss": 0.80506086, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.13122559, + "routerloss_mlp": 0.0, + "step": 1551, + "time_per_iteration": 3.1673550605773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111848, + "balance_loss_mlp": 1.09871709, + "diversity_loss_mlp": 0.0, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07195743014481873, + "language_loss": 0.89281148, + "learning_rate": 0.0008224797159134463, + "loss": 0.90392995, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1552, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121508, + "balance_loss_mlp": 1.10890126, + "diversity_loss_mlp": 0.0, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07485820549569244, + "language_loss": 0.83144093, + "learning_rate": 0.0008222415674599765, + "loss": 0.84265602, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 1553, + "time_per_iteration": 3.077017068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135128, + "balance_loss_mlp": 1.12165701, + "diversity_loss_mlp": 0.0, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.08671551895934956, + "language_loss": 0.83149582, + "learning_rate": 0.0008220032939012349, + "loss": 0.84284711, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1554, + "time_per_iteration": 2.6689035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115966, + "balance_loss_mlp": 1.10284674, + "diversity_loss_mlp": 0.0, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06666483036401037, + "language_loss": 0.87800217, + "learning_rate": 0.0008217648953297277, + "loss": 0.88916183, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1555, + "time_per_iteration": 2.8417294025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119981, + "balance_loss_mlp": 1.10677278, + "diversity_loss_mlp": 0.0, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.08472740856632217, + "language_loss": 0.78017807, + "learning_rate": 0.0008215263718380095, + "loss": 0.7913779, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1556, + "time_per_iteration": 2.682047128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096383, + "balance_loss_mlp": 1.08319807, + "diversity_loss_mlp": 0.0, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.07743195715790333, + "language_loss": 0.84389544, + "learning_rate": 0.0008212877235186833, + "loss": 0.85485923, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.13201904, + "routerloss_mlp": 0.0, + "step": 1557, + "time_per_iteration": 2.6532580852508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074398, + "balance_loss_mlp": 1.06710196, + "diversity_loss_mlp": 0.0, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04061005434024277, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78811955, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 1558, + "time_per_iteration": 4.923272132873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092088, + "balance_loss_mlp": 1.07896352, + "diversity_loss_mlp": 0.0, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.10565427097675566, + "language_loss": 0.8116585, + "learning_rate": 0.0008208100527678611, + "loss": 0.82257938, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1559, + "time_per_iteration": 2.602773427963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.07101393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11780548804152448, + "language_loss": 0.78494406, + "learning_rate": 0.0008205710305218135, + "loss": 0.79578459, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1560, + "time_per_iteration": 3.013576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089526, + "balance_loss_mlp": 1.07663918, + "diversity_loss_mlp": 0.0, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.08018423106971302, + "language_loss": 0.89838511, + "learning_rate": 0.0008203318838190541, + "loss": 0.9092803, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.12890625, + "routerloss_mlp": 0.0, + "step": 1561, + "time_per_iteration": 2.741619348526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108989, + "balance_loss_mlp": 1.07702184, + "diversity_loss_mlp": 0.0, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09397123990600864, + "language_loss": 0.85396177, + "learning_rate": 0.0008200926127524281, + "loss": 0.86486065, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1562, + "time_per_iteration": 2.60974383354187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106708, + "balance_loss_mlp": 1.0936904, + "diversity_loss_mlp": 0.0, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08688269643752358, + "language_loss": 0.83400619, + "learning_rate": 0.0008198532174148289, + "loss": 0.84507322, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1563, + "time_per_iteration": 2.7336533069610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079297, + "balance_loss_mlp": 1.07195389, + "diversity_loss_mlp": 0.0, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.04112604139988501, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81765467, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1564, + "time_per_iteration": 4.828714609146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145426, + "balance_loss_mlp": 1.1324501, + "diversity_loss_mlp": 0.0, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.08852118135813189, + "language_loss": 0.89291, + "learning_rate": 0.0008193740542985244, + "loss": 0.90436429, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1565, + "time_per_iteration": 2.5988731384277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151488, + "balance_loss_mlp": 1.13872099, + "diversity_loss_mlp": 0.0, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1281977179548432, + "language_loss": 0.86354733, + "learning_rate": 0.0008191342867058467, + "loss": 0.87506223, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1566, + "time_per_iteration": 2.6914639472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118723, + "balance_loss_mlp": 1.10574174, + "diversity_loss_mlp": 0.0, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.07018370282969584, + "language_loss": 0.83602738, + "learning_rate": 0.0008188943952142509, + "loss": 0.84721458, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1567, + "time_per_iteration": 2.7846438884735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111402, + "balance_loss_mlp": 1.09847367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.08750889372003143, + "language_loss": 0.82150149, + "learning_rate": 0.0008186543799168711, + "loss": 0.83261549, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1568, + "time_per_iteration": 3.1300384998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094152, + "balance_loss_mlp": 1.08103871, + "diversity_loss_mlp": 0.0, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.07719475001811499, + "language_loss": 0.88627326, + "learning_rate": 0.0008184142409068892, + "loss": 0.89721477, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.13134766, + "routerloss_mlp": 0.0, + "step": 1569, + "time_per_iteration": 2.9922726154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087737, + "balance_loss_mlp": 1.07475495, + "diversity_loss_mlp": 0.0, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.07345065764158631, + "language_loss": 0.86446834, + "learning_rate": 0.000818173978277536, + "loss": 0.87534571, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.12994385, + "routerloss_mlp": 0.0, + "step": 1570, + "time_per_iteration": 2.695930242538452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089564, + "balance_loss_mlp": 1.07673669, + "diversity_loss_mlp": 0.0, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.0712021049255776, + "language_loss": 0.83337176, + "learning_rate": 0.000817933592122089, + "loss": 0.84426749, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.12841797, + "routerloss_mlp": 0.0, + "step": 1571, + "time_per_iteration": 2.7131617069244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.07427394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.08283074842036095, + "language_loss": 0.83667982, + "learning_rate": 0.0008176930825338749, + "loss": 0.84755468, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1572, + "time_per_iteration": 2.5447826385498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087862, + "balance_loss_mlp": 1.07405734, + "diversity_loss_mlp": 0.0, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.07741282152017008, + "language_loss": 0.88849854, + "learning_rate": 0.0008174524496062679, + "loss": 0.89937723, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1573, + "time_per_iteration": 2.908740997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092114, + "balance_loss_mlp": 1.07822633, + "diversity_loss_mlp": 0.0, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.06962859876416791, + "language_loss": 0.85499102, + "learning_rate": 0.0008172116934326894, + "loss": 0.86591208, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1574, + "time_per_iteration": 2.751488208770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_mlp": 1.08365786, + "diversity_loss_mlp": 0.0, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.09195920466248479, + "language_loss": 0.8794626, + "learning_rate": 0.0008169708141066097, + "loss": 0.89044309, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1575, + "time_per_iteration": 2.5947275161743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118908, + "balance_loss_mlp": 1.10441208, + "diversity_loss_mlp": 0.0, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.0784824693742563, + "language_loss": 0.90658617, + "learning_rate": 0.0008167298117215465, + "loss": 0.91777527, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1576, + "time_per_iteration": 2.5396125316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011316, + "balance_loss_mlp": 1.11705649, + "diversity_loss_mlp": 0.0, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.1093253517132677, + "language_loss": 0.87566864, + "learning_rate": 0.0008164886863710649, + "loss": 0.88698471, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1577, + "time_per_iteration": 2.931835412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138004, + "balance_loss_mlp": 1.12323439, + "diversity_loss_mlp": 0.0, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07788016425512684, + "language_loss": 0.8637675, + "learning_rate": 0.0008162474381487783, + "loss": 0.87514758, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.14770508, + "routerloss_mlp": 0.0, + "step": 1578, + "time_per_iteration": 3.041262626647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125978, + "balance_loss_mlp": 1.11132693, + "diversity_loss_mlp": 0.0, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.1532642042193693, + "language_loss": 0.84568751, + "learning_rate": 0.0008160060671483475, + "loss": 0.8569473, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1579, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110829, + "balance_loss_mlp": 1.0942831, + "diversity_loss_mlp": 0.0, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.10001869607158981, + "language_loss": 0.8342396, + "learning_rate": 0.0008157645734634809, + "loss": 0.84532249, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1580, + "time_per_iteration": 2.5994346141815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151521, + "balance_loss_mlp": 1.14064956, + "diversity_loss_mlp": 0.0, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.06737085519591758, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78048015, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 1581, + "time_per_iteration": 4.946556329727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00631723, + "balance_loss_mlp": 1.05820811, + "diversity_loss_mlp": 0.17941347, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.002006006723137456, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.73846221, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01291206, + "step": 1582, + "time_per_iteration": 4.897693395614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.08376384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.07529557219412701, + "language_loss": 0.83949858, + "learning_rate": 0.000815039357240067, + "loss": 0.85047406, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1583, + "time_per_iteration": 2.6096932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101837, + "balance_loss_mlp": 1.0882473, + "diversity_loss_mlp": 0.0, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.0740498467066553, + "language_loss": 0.84922493, + "learning_rate": 0.0008147973737554952, + "loss": 0.86024332, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1584, + "time_per_iteration": 2.7863824367523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106775, + "balance_loss_mlp": 1.09364963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.11669723774220289, + "language_loss": 0.85926318, + "learning_rate": 0.000814555268055744, + "loss": 0.87033093, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1585, + "time_per_iteration": 2.6167564392089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111589, + "balance_loss_mlp": 1.1022768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07476018488685929, + "language_loss": 0.87489879, + "learning_rate": 0.0008143130402348073, + "loss": 0.88605773, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1586, + "time_per_iteration": 2.6318202018737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112097, + "balance_loss_mlp": 1.10742807, + "diversity_loss_mlp": 0.0, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.07016471467090964, + "language_loss": 0.79198885, + "learning_rate": 0.0008140706903867265, + "loss": 0.80319858, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1587, + "time_per_iteration": 2.82663893699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128991, + "balance_loss_mlp": 1.11541307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09040046070353, + "language_loss": 0.90612531, + "learning_rate": 0.0008138282186055897, + "loss": 0.91741514, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1588, + "time_per_iteration": 2.690561294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142156, + "balance_loss_mlp": 1.12872136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.07675542780120453, + "language_loss": 0.82382154, + "learning_rate": 0.0008135856249855331, + "loss": 0.83524311, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1589, + "time_per_iteration": 2.6935813426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115907, + "balance_loss_mlp": 1.14551568, + "diversity_loss_mlp": 0.0, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.07642745969896261, + "language_loss": 0.89603746, + "learning_rate": 0.0008133429096207398, + "loss": 0.90762818, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1590, + "time_per_iteration": 2.7690787315368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113549, + "balance_loss_mlp": 1.10534787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.03962763613217991, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76425815, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.08203125, + "routerloss_mlp": 0.0, + "step": 1591, + "time_per_iteration": 4.950432538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184059, + "balance_loss_mlp": 1.17060041, + "diversity_loss_mlp": 0.0, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.0624915030883944, + "language_loss": 0.8671608, + "learning_rate": 0.0008128571140339123, + "loss": 0.87900144, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1592, + "time_per_iteration": 2.717022657394409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169875, + "balance_loss_mlp": 1.15618944, + "diversity_loss_mlp": 0.0, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.08640912687422367, + "language_loss": 0.87240267, + "learning_rate": 0.0008126140340004805, + "loss": 0.88410139, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.13696289, + "routerloss_mlp": 0.0, + "step": 1593, + "time_per_iteration": 2.5112054347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.14379096, + "diversity_loss_mlp": 0.0, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.06492228459438584, + "language_loss": 0.82168889, + "learning_rate": 0.0008123708325995172, + "loss": 0.83326268, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1594, + "time_per_iteration": 3.193125009536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139509, + "balance_loss_mlp": 1.1256932, + "diversity_loss_mlp": 0.0, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.06515151231920442, + "language_loss": 0.79815221, + "learning_rate": 0.0008121275099254414, + "loss": 0.80954736, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1595, + "time_per_iteration": 2.9032304286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133663, + "balance_loss_mlp": 1.12007284, + "diversity_loss_mlp": 0.0, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06899315915000012, + "language_loss": 0.88638222, + "learning_rate": 0.0008118840660727194, + "loss": 0.89771879, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1596, + "time_per_iteration": 2.6298515796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115466, + "balance_loss_mlp": 1.10215056, + "diversity_loss_mlp": 0.0, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06984166924665287, + "language_loss": 0.87847084, + "learning_rate": 0.0008116405011358644, + "loss": 0.88962543, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.13336182, + "routerloss_mlp": 0.0, + "step": 1597, + "time_per_iteration": 3.1922342777252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.08212388, + "diversity_loss_mlp": 0.0, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.07145022695402857, + "language_loss": 0.79985273, + "learning_rate": 0.0008113968152094369, + "loss": 0.81081259, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1598, + "time_per_iteration": 2.500500440597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090097, + "balance_loss_mlp": 1.07637632, + "diversity_loss_mlp": 0.0, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.07896733537507578, + "language_loss": 0.82477671, + "learning_rate": 0.0008111530083880438, + "loss": 0.83567768, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1599, + "time_per_iteration": 2.9081485271453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.07693791, + "diversity_loss_mlp": 0.0, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.10700735308097704, + "language_loss": 0.86289096, + "learning_rate": 0.0008109090807663399, + "loss": 0.87379909, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1600, + "time_per_iteration": 2.7883458137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084832, + "balance_loss_mlp": 1.07049167, + "diversity_loss_mlp": 0.0, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.058046583591585654, + "language_loss": 0.8845669, + "learning_rate": 0.0008106650324390257, + "loss": 0.89541531, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.14331055, + "routerloss_mlp": 0.0, + "step": 1601, + "time_per_iteration": 2.8250818252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012306, + "balance_loss_mlp": 1.78856134, + "diversity_loss_mlp": 0.20302816, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.03151963489439222, + "language_loss": 0.81347358, + "learning_rate": 0.0008104208635008493, + "loss": 0.8235966, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0165114, + "step": 1602, + "time_per_iteration": 2.6824991703033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078191, + "balance_loss_mlp": 1.06365991, + "diversity_loss_mlp": 0.0, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.06925842581040223, + "language_loss": 0.81696957, + "learning_rate": 0.0008101765740466058, + "loss": 0.82775152, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1603, + "time_per_iteration": 2.4828884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083153, + "balance_loss_mlp": 1.06891942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.08194523431430376, + "language_loss": 0.83996522, + "learning_rate": 0.0008099321641711364, + "loss": 0.85079676, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.14221191, + "routerloss_mlp": 0.0, + "step": 1604, + "time_per_iteration": 2.628990650177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093302, + "balance_loss_mlp": 1.07891393, + "diversity_loss_mlp": 0.0, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.066381842407901, + "language_loss": 0.83568424, + "learning_rate": 0.0008096876339693295, + "loss": 0.84661728, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1605, + "time_per_iteration": 2.621486186981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104017, + "balance_loss_mlp": 1.0898906, + "diversity_loss_mlp": 0.0, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08065648415588843, + "language_loss": 0.8146233, + "learning_rate": 0.0008094429835361206, + "loss": 0.82566357, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1606, + "time_per_iteration": 2.9436137676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101821, + "balance_loss_mlp": 1.08727765, + "diversity_loss_mlp": 0.0, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.06722603246449312, + "language_loss": 0.85730284, + "learning_rate": 0.0008091982129664908, + "loss": 0.86832106, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.14538574, + "routerloss_mlp": 0.0, + "step": 1607, + "time_per_iteration": 2.6776270866394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110298, + "balance_loss_mlp": 1.09606481, + "diversity_loss_mlp": 0.0, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07435522574008574, + "language_loss": 0.83177197, + "learning_rate": 0.0008089533223554687, + "loss": 0.842875, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1608, + "time_per_iteration": 2.6971724033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106883, + "balance_loss_mlp": 1.09322155, + "diversity_loss_mlp": 0.0, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08534881839400792, + "language_loss": 0.85436511, + "learning_rate": 0.0008087083117981294, + "loss": 0.86543399, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1609, + "time_per_iteration": 2.873072624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_mlp": 1.08715367, + "diversity_loss_mlp": 0.0, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.08408730625442483, + "language_loss": 0.88209295, + "learning_rate": 0.0008084631813895943, + "loss": 0.89310181, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.13745117, + "routerloss_mlp": 0.0, + "step": 1610, + "time_per_iteration": 2.7717368602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_mlp": 1.0843389, + "diversity_loss_mlp": 0.0, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07291880748627809, + "language_loss": 0.84093356, + "learning_rate": 0.0008082179312250315, + "loss": 0.85191453, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1611, + "time_per_iteration": 2.6323728561401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167376, + "balance_loss_mlp": 1.15912676, + "diversity_loss_mlp": 0.0, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.06715325583723679, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81023216, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 1612, + "time_per_iteration": 4.837978839874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103787, + "balance_loss_mlp": 1.09591889, + "diversity_loss_mlp": 0.0, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.04843806861709949, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77733123, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.07861328, + "routerloss_mlp": 0.0, + "step": 1613, + "time_per_iteration": 5.086154937744141 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118625, + "balance_loss_mlp": 1.10497594, + "diversity_loss_mlp": 0.0, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.09649046421891638, + "language_loss": 0.82414234, + "learning_rate": 0.0008074814631475545, + "loss": 0.83532858, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.13671875, + "routerloss_mlp": 0.0, + "step": 1614, + "time_per_iteration": 3.3300058841705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115901, + "balance_loss_mlp": 1.10232294, + "diversity_loss_mlp": 0.0, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.10381126956618623, + "language_loss": 0.7917223, + "learning_rate": 0.0008072357349114907, + "loss": 0.80288124, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.1361084, + "routerloss_mlp": 0.0, + "step": 1615, + "time_per_iteration": 2.692242383956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.1100384, + "diversity_loss_mlp": 0.0, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.09811598085954727, + "language_loss": 0.88751173, + "learning_rate": 0.0008069898873959363, + "loss": 0.89874619, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1616, + "time_per_iteration": 2.688138723373413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119599, + "balance_loss_mlp": 1.10590243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.06496922585492992, + "language_loss": 0.85670269, + "learning_rate": 0.0008067439206963375, + "loss": 0.8678987, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1617, + "time_per_iteration": 2.628465175628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126727, + "balance_loss_mlp": 1.11359048, + "diversity_loss_mlp": 0.0, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08367367493581554, + "language_loss": 0.86233091, + "learning_rate": 0.0008064978349081873, + "loss": 0.87359822, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.13146973, + "routerloss_mlp": 0.0, + "step": 1618, + "time_per_iteration": 2.9359195232391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_mlp": 1.10941529, + "diversity_loss_mlp": 0.0, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.062058920213391884, + "language_loss": 0.86742592, + "learning_rate": 0.0008062516301270245, + "loss": 0.87865382, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1619, + "time_per_iteration": 2.685615301132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00968061, + "balance_loss_mlp": 1.70987701, + "diversity_loss_mlp": 0.19448289, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.02692656797073588, + "language_loss": 0.8831743, + "learning_rate": 0.0008060053064484343, + "loss": 0.89285493, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01588114, + "step": 1620, + "time_per_iteration": 2.9507076740264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131577, + "balance_loss_mlp": 1.11839283, + "diversity_loss_mlp": 0.0, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.08216719715750098, + "language_loss": 0.85142976, + "learning_rate": 0.0008057588639680482, + "loss": 0.86274558, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.13208008, + "routerloss_mlp": 0.0, + "step": 1621, + "time_per_iteration": 2.7498936653137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00955916, + "balance_loss_mlp": 1.68915153, + "diversity_loss_mlp": 0.19115068, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.038673577194741904, + "language_loss": 0.82934028, + "learning_rate": 0.0008055123027815434, + "loss": 0.83889943, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01576493, + "step": 1622, + "time_per_iteration": 2.92877459526062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10545552, + "diversity_loss_mlp": 0.0, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.11144773799130939, + "language_loss": 0.8492527, + "learning_rate": 0.0008052656229846436, + "loss": 0.86044282, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.13580322, + "routerloss_mlp": 0.0, + "step": 1623, + "time_per_iteration": 2.6647849082946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104342, + "balance_loss_mlp": 1.09039474, + "diversity_loss_mlp": 0.0, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.09067734621983937, + "language_loss": 0.90320027, + "learning_rate": 0.0008050188246731182, + "loss": 0.9142437, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1624, + "time_per_iteration": 2.6908931732177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108727, + "balance_loss_mlp": 1.07360816, + "diversity_loss_mlp": 0.0, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.08706559573327896, + "language_loss": 0.8222695, + "learning_rate": 0.0008047719079427834, + "loss": 0.83314216, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1625, + "time_per_iteration": 2.979578733444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281481, + "balance_loss_mlp": 1.27170551, + "diversity_loss_mlp": 0.0, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.09241126848133228, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75633186, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.09765625, + "routerloss_mlp": 0.0, + "step": 1626, + "time_per_iteration": 4.813723802566528 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_mlp": 1.06489933, + "diversity_loss_mlp": 0.0, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.061158387019755324, + "language_loss": 0.86164916, + "learning_rate": 0.0008042777196091757, + "loss": 0.87243509, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.13708496, + "routerloss_mlp": 0.0, + "step": 1627, + "time_per_iteration": 2.6777052879333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931263, + "balance_loss_mlp": 1.63595629, + "diversity_loss_mlp": 0.19502082, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.02888255305303151, + "language_loss": 0.81839561, + "learning_rate": 0.0008040304481977643, + "loss": 0.82770824, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01577434, + "step": 1628, + "time_per_iteration": 2.685519218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083762, + "balance_loss_mlp": 1.07024312, + "diversity_loss_mlp": 0.0, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.070875243316129, + "language_loss": 0.86462033, + "learning_rate": 0.0008037830587512649, + "loss": 0.875458, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1629, + "time_per_iteration": 3.0812296867370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093655, + "balance_loss_mlp": 1.07976675, + "diversity_loss_mlp": 0.0, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.07857424850498267, + "language_loss": 0.78910959, + "learning_rate": 0.0008035355513657224, + "loss": 0.80004621, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1630, + "time_per_iteration": 2.509866714477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109932, + "balance_loss_mlp": 1.08518136, + "diversity_loss_mlp": 0.0, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.05926482463995905, + "language_loss": 0.9323386, + "learning_rate": 0.0008032879261372279, + "loss": 0.94333184, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.14135742, + "routerloss_mlp": 0.0, + "step": 1631, + "time_per_iteration": 2.793675422668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121244, + "balance_loss_mlp": 1.20142555, + "diversity_loss_mlp": 0.0, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.0543299042148954, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80848283, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 1632, + "time_per_iteration": 5.6717705726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08712876, + "diversity_loss_mlp": 0.0, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.07399367926820971, + "language_loss": 0.87236691, + "learning_rate": 0.0008027923225359748, + "loss": 0.88337696, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.13885498, + "routerloss_mlp": 0.0, + "step": 1633, + "time_per_iteration": 2.591161012649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09272563, + "diversity_loss_mlp": 0.0, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07361205381971474, + "language_loss": 0.8823992, + "learning_rate": 0.0008025443443556267, + "loss": 0.89347273, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1634, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106628, + "balance_loss_mlp": 1.09279966, + "diversity_loss_mlp": 0.0, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.05821338652647348, + "language_loss": 0.88174599, + "learning_rate": 0.000802296248717147, + "loss": 0.89281231, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1635, + "time_per_iteration": 2.924661159515381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102803, + "balance_loss_mlp": 1.08889091, + "diversity_loss_mlp": 0.0, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.06918051977022115, + "language_loss": 0.78766519, + "learning_rate": 0.0008020480357168554, + "loss": 0.79869324, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1636, + "time_per_iteration": 2.8397598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_mlp": 1.08334041, + "diversity_loss_mlp": 0.0, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.061070409346790804, + "language_loss": 0.88343245, + "learning_rate": 0.0008017997054511165, + "loss": 0.89440191, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.13623047, + "routerloss_mlp": 0.0, + "step": 1637, + "time_per_iteration": 2.5770463943481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109567, + "balance_loss_mlp": 1.08241367, + "diversity_loss_mlp": 0.0, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.06082888573267997, + "language_loss": 0.85688329, + "learning_rate": 0.0008015512580163407, + "loss": 0.86783999, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1638, + "time_per_iteration": 2.7893900871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915347, + "balance_loss_mlp": 1.6005652, + "diversity_loss_mlp": 0.19760543, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.03200753828687725, + "language_loss": 0.80247211, + "learning_rate": 0.0008013026935089838, + "loss": 0.8116256, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0162621, + "step": 1639, + "time_per_iteration": 2.9013028144836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116887, + "balance_loss_mlp": 1.10366678, + "diversity_loss_mlp": 0.0, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.07107229367788748, + "language_loss": 0.84156835, + "learning_rate": 0.0008010540120255472, + "loss": 0.85273731, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1640, + "time_per_iteration": 2.6617894172668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122905, + "balance_loss_mlp": 1.10991144, + "diversity_loss_mlp": 0.0, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.08316081918757003, + "language_loss": 0.86058956, + "learning_rate": 0.0008008052136625774, + "loss": 0.87181866, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1641, + "time_per_iteration": 2.8128581047058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.10461712, + "diversity_loss_mlp": 0.0, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.11340060957388516, + "language_loss": 0.86898887, + "learning_rate": 0.0008005562985166666, + "loss": 0.88016647, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1642, + "time_per_iteration": 2.6915791034698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113412, + "balance_loss_mlp": 1.10045385, + "diversity_loss_mlp": 0.0, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.06371803301806024, + "language_loss": 0.85065734, + "learning_rate": 0.0008003072666844524, + "loss": 0.86179143, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.12976074, + "routerloss_mlp": 0.0, + "step": 1643, + "time_per_iteration": 2.713515520095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110554, + "balance_loss_mlp": 1.09287417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.09207812275617455, + "language_loss": 0.82446098, + "learning_rate": 0.0008000581182626173, + "loss": 0.83551639, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 1644, + "time_per_iteration": 2.5728507041931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099382, + "balance_loss_mlp": 1.08668065, + "diversity_loss_mlp": 0.0, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.07446065392993936, + "language_loss": 0.86341298, + "learning_rate": 0.0007998088533478894, + "loss": 0.87440687, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 1645, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103676, + "balance_loss_mlp": 1.09096265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.09512310951915111, + "language_loss": 0.84171218, + "learning_rate": 0.000799559472037042, + "loss": 0.85274899, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 1646, + "time_per_iteration": 2.5341672897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089286, + "balance_loss_mlp": 1.07678151, + "diversity_loss_mlp": 0.0, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.05690135295492242, + "language_loss": 0.87462902, + "learning_rate": 0.0007993099744268932, + "loss": 0.88552189, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1647, + "time_per_iteration": 2.9204719066619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097973, + "balance_loss_mlp": 1.08491409, + "diversity_loss_mlp": 0.0, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08028992569563033, + "language_loss": 0.88103539, + "learning_rate": 0.000799060360614307, + "loss": 0.8920151, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1648, + "time_per_iteration": 2.7098584175109863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094994, + "balance_loss_mlp": 1.08204746, + "diversity_loss_mlp": 0.0, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07374581447427947, + "language_loss": 0.83565277, + "learning_rate": 0.0007988106306961917, + "loss": 0.84660268, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1649, + "time_per_iteration": 3.136148691177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096074, + "balance_loss_mlp": 1.08292556, + "diversity_loss_mlp": 0.0, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.08307651310008923, + "language_loss": 0.84510154, + "learning_rate": 0.0007985607847695014, + "loss": 0.85606229, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1650, + "time_per_iteration": 2.6657865047454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090136, + "balance_loss_mlp": 1.07697558, + "diversity_loss_mlp": 0.0, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.07221907468491222, + "language_loss": 0.82981718, + "learning_rate": 0.0007983108229312345, + "loss": 0.84071863, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.13183594, + "routerloss_mlp": 0.0, + "step": 1651, + "time_per_iteration": 2.939943313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109006, + "balance_loss_mlp": 1.07648206, + "diversity_loss_mlp": 0.0, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.0785368607999539, + "language_loss": 0.86505926, + "learning_rate": 0.0007980607452784351, + "loss": 0.87595987, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1652, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.06952596, + "diversity_loss_mlp": 0.0, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.06920593361186494, + "language_loss": 0.90510356, + "learning_rate": 0.0007978105519081919, + "loss": 0.91593033, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.13165283, + "routerloss_mlp": 0.0, + "step": 1653, + "time_per_iteration": 2.665844440460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084984, + "balance_loss_mlp": 1.0715965, + "diversity_loss_mlp": 0.0, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.07269169213621761, + "language_loss": 0.87967515, + "learning_rate": 0.0007975602429176385, + "loss": 0.89052504, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.13415527, + "routerloss_mlp": 0.0, + "step": 1654, + "time_per_iteration": 2.5818393230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085975, + "balance_loss_mlp": 1.07225442, + "diversity_loss_mlp": 0.0, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08150423110047789, + "language_loss": 0.81308222, + "learning_rate": 0.0007973098184039536, + "loss": 0.82394195, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1655, + "time_per_iteration": 2.664916515350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094509, + "balance_loss_mlp": 1.08110952, + "diversity_loss_mlp": 0.0, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.0661968945841423, + "language_loss": 0.8695243, + "learning_rate": 0.0007970592784643602, + "loss": 0.88046944, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.13427734, + "routerloss_mlp": 0.0, + "step": 1656, + "time_per_iteration": 2.851214647293091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104427, + "balance_loss_mlp": 1.09084868, + "diversity_loss_mlp": 0.0, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.0809768283097012, + "language_loss": 0.85228848, + "learning_rate": 0.0007968086231961272, + "loss": 0.86333275, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.13598633, + "routerloss_mlp": 0.0, + "step": 1657, + "time_per_iteration": 2.6277201175689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111168, + "balance_loss_mlp": 1.09744644, + "diversity_loss_mlp": 0.0, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.10999441213252201, + "language_loss": 0.83322126, + "learning_rate": 0.0007965578526965671, + "loss": 0.84433806, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.14245605, + "routerloss_mlp": 0.0, + "step": 1658, + "time_per_iteration": 2.5514447689056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097556, + "balance_loss_mlp": 1.08337009, + "diversity_loss_mlp": 0.0, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07090711515760839, + "language_loss": 0.86299932, + "learning_rate": 0.0007963069670630377, + "loss": 0.87397492, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1659, + "time_per_iteration": 2.722572088241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_mlp": 1.07523549, + "diversity_loss_mlp": 0.0, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07181055202596492, + "language_loss": 0.88127738, + "learning_rate": 0.0007960559663929416, + "loss": 0.8921715, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1660, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_mlp": 1.06500006, + "diversity_loss_mlp": 0.0, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.06614466369263741, + "language_loss": 0.87915826, + "learning_rate": 0.0007958048507837259, + "loss": 0.88995141, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1661, + "time_per_iteration": 2.954888343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06107187, + "diversity_loss_mlp": 0.0, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08599761261652404, + "language_loss": 0.87309289, + "learning_rate": 0.0007955536203328822, + "loss": 0.88384914, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1662, + "time_per_iteration": 2.9499282836914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074811, + "balance_loss_mlp": 1.06073272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.08962386225204486, + "language_loss": 0.8334958, + "learning_rate": 0.0007953022751379469, + "loss": 0.84424388, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.140625, + "routerloss_mlp": 0.0, + "step": 1663, + "time_per_iteration": 2.768754005432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075722, + "balance_loss_mlp": 1.06131005, + "diversity_loss_mlp": 0.0, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.08182948291647181, + "language_loss": 0.8200748, + "learning_rate": 0.000795050815296501, + "loss": 0.830832, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1664, + "time_per_iteration": 2.9893014430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.07167196, + "diversity_loss_mlp": 0.0, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.0641722272838546, + "language_loss": 0.93037909, + "learning_rate": 0.0007947992409061695, + "loss": 0.94122881, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.13330078, + "routerloss_mlp": 0.0, + "step": 1665, + "time_per_iteration": 2.583789110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100063, + "balance_loss_mlp": 1.08662808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07388769827525307, + "language_loss": 0.86501724, + "learning_rate": 0.0007945475520646226, + "loss": 0.87601787, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1666, + "time_per_iteration": 2.944988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127031, + "balance_loss_mlp": 1.11408508, + "diversity_loss_mlp": 0.0, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.0781321549049884, + "language_loss": 0.84777099, + "learning_rate": 0.0007942957488695743, + "loss": 0.85904133, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.12957764, + "routerloss_mlp": 0.0, + "step": 1667, + "time_per_iteration": 2.667464017868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.12505507, + "diversity_loss_mlp": 0.0, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06588913292879497, + "language_loss": 0.81000018, + "learning_rate": 0.0007940438314187833, + "loss": 0.82138324, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.13250732, + "routerloss_mlp": 0.0, + "step": 1668, + "time_per_iteration": 3.0395359992980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147791, + "balance_loss_mlp": 1.13491094, + "diversity_loss_mlp": 0.0, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.07621602089938284, + "language_loss": 0.80540276, + "learning_rate": 0.0007937917998100529, + "loss": 0.8168807, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1669, + "time_per_iteration": 2.5894687175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142528, + "balance_loss_mlp": 1.1294744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07981389159152626, + "language_loss": 0.79167509, + "learning_rate": 0.0007935396541412302, + "loss": 0.80310035, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.13067627, + "routerloss_mlp": 0.0, + "step": 1670, + "time_per_iteration": 2.672978401184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141245, + "balance_loss_mlp": 1.12813175, + "diversity_loss_mlp": 0.0, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.06899314705075654, + "language_loss": 0.85712755, + "learning_rate": 0.0007932873945102068, + "loss": 0.86854005, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.13128662, + "routerloss_mlp": 0.0, + "step": 1671, + "time_per_iteration": 2.6296515464782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272088, + "balance_loss_mlp": 1.25616145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.05047573422440889, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.77033865, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1672, + "time_per_iteration": 4.840561628341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138556, + "balance_loss_mlp": 1.1251744, + "diversity_loss_mlp": 0.0, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.06902528499394482, + "language_loss": 0.86527705, + "learning_rate": 0.0007927825337533461, + "loss": 0.87666261, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.1338501, + "routerloss_mlp": 0.0, + "step": 1673, + "time_per_iteration": 2.693758964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142697, + "balance_loss_mlp": 1.12930942, + "diversity_loss_mlp": 0.0, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.08521571565711833, + "language_loss": 0.84877092, + "learning_rate": 0.0007925299328235131, + "loss": 0.8601979, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1674, + "time_per_iteration": 2.659621238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141943, + "balance_loss_mlp": 1.12855613, + "diversity_loss_mlp": 0.0, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.08187135533898351, + "language_loss": 0.84720862, + "learning_rate": 0.000792277218323488, + "loss": 0.85862803, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.1340332, + "routerloss_mlp": 0.0, + "step": 1675, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135022, + "balance_loss_mlp": 1.12169456, + "diversity_loss_mlp": 0.0, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.08499328402904442, + "language_loss": 0.8509531, + "learning_rate": 0.0007920243903513833, + "loss": 0.86230332, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.13342285, + "routerloss_mlp": 0.0, + "step": 1676, + "time_per_iteration": 2.5730555057525635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126699, + "balance_loss_mlp": 1.11364567, + "diversity_loss_mlp": 0.0, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08854342537284099, + "language_loss": 0.84008271, + "learning_rate": 0.0007917714490053556, + "loss": 0.85134971, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1677, + "time_per_iteration": 2.718555212020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122958, + "balance_loss_mlp": 1.10974979, + "diversity_loss_mlp": 0.0, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.07711595043056121, + "language_loss": 0.86223996, + "learning_rate": 0.0007915183943836055, + "loss": 0.87346947, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.13220215, + "routerloss_mlp": 0.0, + "step": 1678, + "time_per_iteration": 2.902038812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112402, + "balance_loss_mlp": 1.09958673, + "diversity_loss_mlp": 0.0, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.07762427611918464, + "language_loss": 0.8422336, + "learning_rate": 0.0007912652265843773, + "loss": 0.85335761, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1679, + "time_per_iteration": 3.024665117263794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107958, + "balance_loss_mlp": 1.09453535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06959311244041297, + "language_loss": 0.81845474, + "learning_rate": 0.0007910119457059597, + "loss": 0.82953429, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.13439941, + "routerloss_mlp": 0.0, + "step": 1680, + "time_per_iteration": 2.6954221725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111109, + "balance_loss_mlp": 1.09806776, + "diversity_loss_mlp": 0.0, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08135634404485692, + "language_loss": 0.80380678, + "learning_rate": 0.0007907585518466849, + "loss": 0.81491786, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.13061523, + "routerloss_mlp": 0.0, + "step": 1681, + "time_per_iteration": 2.961648464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108764, + "balance_loss_mlp": 1.09574652, + "diversity_loss_mlp": 0.0, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.06462126830885603, + "language_loss": 0.89670283, + "learning_rate": 0.000790505045104929, + "loss": 0.90779042, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1682, + "time_per_iteration": 2.5210485458374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111018, + "balance_loss_mlp": 1.09719789, + "diversity_loss_mlp": 0.0, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.08715930327910015, + "language_loss": 0.86719161, + "learning_rate": 0.0007902514255791125, + "loss": 0.8782934, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1683, + "time_per_iteration": 2.8002610206604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097901, + "balance_loss_mlp": 1.084764, + "diversity_loss_mlp": 0.0, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.06656486310868524, + "language_loss": 0.8795855, + "learning_rate": 0.0007899976933676986, + "loss": 0.89056444, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1684, + "time_per_iteration": 2.967172622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092249, + "balance_loss_mlp": 1.07880259, + "diversity_loss_mlp": 0.0, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09628316614228749, + "language_loss": 0.87045735, + "learning_rate": 0.0007897438485691955, + "loss": 0.88137984, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.13464355, + "routerloss_mlp": 0.0, + "step": 1685, + "time_per_iteration": 2.680147171020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103099, + "balance_loss_mlp": 1.0898304, + "diversity_loss_mlp": 0.0, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0850736326825917, + "language_loss": 0.82684374, + "learning_rate": 0.0007894898912821542, + "loss": 0.83787471, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1686, + "time_per_iteration": 2.554380416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101582, + "balance_loss_mlp": 1.0880518, + "diversity_loss_mlp": 0.0, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06056792299191916, + "language_loss": 0.86695451, + "learning_rate": 0.0007892358216051695, + "loss": 0.87797034, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1687, + "time_per_iteration": 2.7851648330688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.09641767, + "diversity_loss_mlp": 0.0, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.07434076211008771, + "language_loss": 0.91829026, + "learning_rate": 0.0007889816396368803, + "loss": 0.92938912, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1688, + "time_per_iteration": 2.6211581230163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.10499799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.07845440141588131, + "language_loss": 0.85253429, + "learning_rate": 0.0007887273454759687, + "loss": 0.8637172, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.13299561, + "routerloss_mlp": 0.0, + "step": 1689, + "time_per_iteration": 2.507779598236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.10946417, + "diversity_loss_mlp": 0.0, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.08373410695529686, + "language_loss": 0.82792354, + "learning_rate": 0.0007884729392211603, + "loss": 0.83914578, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1690, + "time_per_iteration": 2.6805906295776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119249, + "balance_loss_mlp": 1.10672641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09069843341009556, + "language_loss": 0.85648167, + "learning_rate": 0.0007882184209712245, + "loss": 0.86767411, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.12530518, + "routerloss_mlp": 0.0, + "step": 1691, + "time_per_iteration": 2.569239377975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00949982, + "balance_loss_mlp": 1.66309059, + "diversity_loss_mlp": 0.20491584, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.028395749586794427, + "language_loss": 0.85757548, + "learning_rate": 0.000787963790824974, + "loss": 0.86707526, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01597837, + "step": 1692, + "time_per_iteration": 3.009209156036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113225, + "balance_loss_mlp": 1.10071397, + "diversity_loss_mlp": 0.0, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.22846677162281695, + "language_loss": 0.89612615, + "learning_rate": 0.0007877090488812651, + "loss": 0.90725839, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.12512207, + "routerloss_mlp": 0.0, + "step": 1693, + "time_per_iteration": 2.450209617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00936753, + "balance_loss_mlp": 1.63723278, + "diversity_loss_mlp": 0.20419246, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.03161007726798549, + "language_loss": 0.83743423, + "learning_rate": 0.0007874541952389973, + "loss": 0.84680176, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01604037, + "step": 1694, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111615, + "balance_loss_mlp": 1.10350823, + "diversity_loss_mlp": 0.0, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.07424213060006848, + "language_loss": 0.86538494, + "learning_rate": 0.0007871992299971136, + "loss": 0.87654638, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1695, + "time_per_iteration": 2.570406913757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131966, + "balance_loss_mlp": 1.11953878, + "diversity_loss_mlp": 0.0, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.0612219868328418, + "language_loss": 0.84142137, + "learning_rate": 0.0007869441532546001, + "loss": 0.852741, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1696, + "time_per_iteration": 2.763688087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128533, + "balance_loss_mlp": 1.11626601, + "diversity_loss_mlp": 0.0, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.06155756648422996, + "language_loss": 0.79298395, + "learning_rate": 0.0007866889651104867, + "loss": 0.80426925, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 1697, + "time_per_iteration": 2.816236972808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130866, + "balance_loss_mlp": 1.11769366, + "diversity_loss_mlp": 0.0, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0827611554210385, + "language_loss": 0.83172429, + "learning_rate": 0.000786433665663846, + "loss": 0.84303296, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.13195801, + "routerloss_mlp": 0.0, + "step": 1698, + "time_per_iteration": 2.6627049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135283, + "balance_loss_mlp": 1.12240815, + "diversity_loss_mlp": 0.0, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.08562611300573084, + "language_loss": 0.86256903, + "learning_rate": 0.0007861782550137942, + "loss": 0.87392187, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1699, + "time_per_iteration": 2.9298973083496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115677, + "balance_loss_mlp": 1.10270739, + "diversity_loss_mlp": 0.0, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.06870341741306431, + "language_loss": 0.85913056, + "learning_rate": 0.0007859227332594901, + "loss": 0.8702873, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1700, + "time_per_iteration": 2.9108214378356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099921, + "balance_loss_mlp": 1.08703494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.08010897822069696, + "language_loss": 0.84705722, + "learning_rate": 0.0007856671005001365, + "loss": 0.85805643, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1701, + "time_per_iteration": 3.172921895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088126, + "balance_loss_mlp": 1.07506084, + "diversity_loss_mlp": 0.0, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.0963591610521261, + "language_loss": 0.81720912, + "learning_rate": 0.0007854113568349787, + "loss": 0.82809043, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.13085938, + "routerloss_mlp": 0.0, + "step": 1702, + "time_per_iteration": 3.1135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_mlp": 1.08686948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07838750037803571, + "language_loss": 0.80661154, + "learning_rate": 0.0007851555023633052, + "loss": 0.8176142, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.13397217, + "routerloss_mlp": 0.0, + "step": 1703, + "time_per_iteration": 2.841059684753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086421, + "balance_loss_mlp": 1.07271171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.07047077484334266, + "language_loss": 0.82222247, + "learning_rate": 0.0007848995371844474, + "loss": 0.83308667, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1704, + "time_per_iteration": 2.515455961227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.0816896, + "diversity_loss_mlp": 0.0, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08203255389116743, + "language_loss": 0.80260348, + "learning_rate": 0.0007846434613977801, + "loss": 0.81355333, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.13305664, + "routerloss_mlp": 0.0, + "step": 1705, + "time_per_iteration": 2.523026466369629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.0868392, + "diversity_loss_mlp": 0.0, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.07270926258732689, + "language_loss": 0.78603041, + "learning_rate": 0.0007843872751027203, + "loss": 0.7970314, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.13275146, + "routerloss_mlp": 0.0, + "step": 1706, + "time_per_iteration": 2.8923709392547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00915397, + "balance_loss_mlp": 1.59612775, + "diversity_loss_mlp": 0.20258766, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.02966318853366187, + "language_loss": 0.87305748, + "learning_rate": 0.0007841309783987287, + "loss": 0.88221151, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01603885, + "step": 1707, + "time_per_iteration": 2.7517144680023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115655, + "balance_loss_mlp": 1.10263109, + "diversity_loss_mlp": 0.0, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.06500174516261728, + "language_loss": 0.89240694, + "learning_rate": 0.0007838745713853084, + "loss": 0.9035635, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.13031006, + "routerloss_mlp": 0.0, + "step": 1708, + "time_per_iteration": 2.6181201934814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122322, + "balance_loss_mlp": 1.10945296, + "diversity_loss_mlp": 0.0, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.06936064314807153, + "language_loss": 0.8434307, + "learning_rate": 0.0007836180541620053, + "loss": 0.85465395, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.12866211, + "routerloss_mlp": 0.0, + "step": 1709, + "time_per_iteration": 2.7040350437164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124287, + "balance_loss_mlp": 1.1112572, + "diversity_loss_mlp": 0.0, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.06883588356672955, + "language_loss": 0.86454904, + "learning_rate": 0.0007833614268284082, + "loss": 0.87579191, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 1710, + "time_per_iteration": 2.5110740661621094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01425821, + "balance_loss_mlp": 1.41738081, + "diversity_loss_mlp": 0.0, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.1402114647579648, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75535595, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.08447266, + "routerloss_mlp": 0.0, + "step": 1711, + "time_per_iteration": 4.873327016830444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129416, + "balance_loss_mlp": 1.11650598, + "diversity_loss_mlp": 0.0, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0798208466882041, + "language_loss": 0.78414649, + "learning_rate": 0.0007828478422289016, + "loss": 0.79544067, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 1712, + "time_per_iteration": 2.608412027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138403, + "balance_loss_mlp": 1.12507582, + "diversity_loss_mlp": 0.0, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07544776571140048, + "language_loss": 0.8909815, + "learning_rate": 0.0007825908851623833, + "loss": 0.90236557, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.13323975, + "routerloss_mlp": 0.0, + "step": 1713, + "time_per_iteration": 2.8033607006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134961, + "balance_loss_mlp": 1.12190771, + "diversity_loss_mlp": 0.0, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.06974595077498419, + "language_loss": 0.85003847, + "learning_rate": 0.0007823338183843533, + "loss": 0.86138809, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1714, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148942, + "balance_loss_mlp": 1.13610959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.07049806127627434, + "language_loss": 0.81025606, + "learning_rate": 0.0007820766419946141, + "loss": 0.82174551, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.1282959, + "routerloss_mlp": 0.0, + "step": 1715, + "time_per_iteration": 3.3007164001464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168148, + "balance_loss_mlp": 1.16008925, + "diversity_loss_mlp": 0.0, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.052131774928428895, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80840629, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.08056641, + "routerloss_mlp": 0.0, + "step": 1716, + "time_per_iteration": 4.947760105133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906852, + "balance_loss_mlp": 1.58163857, + "diversity_loss_mlp": 0.20079982, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.033697214377685164, + "language_loss": 0.75853068, + "learning_rate": 0.0007815619607794288, + "loss": 0.76759923, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563331, + "step": 1717, + "time_per_iteration": 2.689937114715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173062, + "balance_loss_mlp": 1.1601274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.09689448967864323, + "language_loss": 0.8294118, + "learning_rate": 0.0007813044561538001, + "loss": 0.84114236, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.12945557, + "routerloss_mlp": 0.0, + "step": 1718, + "time_per_iteration": 3.1421005725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158875, + "balance_loss_mlp": 1.14559531, + "diversity_loss_mlp": 0.0, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.06842928932014077, + "language_loss": 0.88578129, + "learning_rate": 0.0007810468423160958, + "loss": 0.89736998, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1719, + "time_per_iteration": 2.8917293548583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157511, + "balance_loss_mlp": 1.14486265, + "diversity_loss_mlp": 0.0, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.06941390463820386, + "language_loss": 0.81896281, + "learning_rate": 0.0007807891193663306, + "loss": 0.83053792, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 1720, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141979, + "balance_loss_mlp": 1.12950385, + "diversity_loss_mlp": 0.0, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.07961809028947962, + "language_loss": 0.82409328, + "learning_rate": 0.0007805312874045614, + "loss": 0.83551311, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1721, + "time_per_iteration": 2.5056259632110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137994, + "balance_loss_mlp": 1.12510777, + "diversity_loss_mlp": 0.0, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.09061115976682882, + "language_loss": 0.86960506, + "learning_rate": 0.0007802733465308874, + "loss": 0.88098502, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 1722, + "time_per_iteration": 2.438533306121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144898, + "balance_loss_mlp": 1.13225603, + "diversity_loss_mlp": 0.0, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.06773749819611302, + "language_loss": 0.84162688, + "learning_rate": 0.0007800152968454501, + "loss": 0.8530758, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.12652588, + "routerloss_mlp": 0.0, + "step": 1723, + "time_per_iteration": 2.6364991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134044, + "balance_loss_mlp": 1.12146711, + "diversity_loss_mlp": 0.0, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.06044198445597461, + "language_loss": 0.90330362, + "learning_rate": 0.0007797571384484334, + "loss": 0.91464406, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.12567139, + "routerloss_mlp": 0.0, + "step": 1724, + "time_per_iteration": 2.8638265132904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133346, + "balance_loss_mlp": 1.12061453, + "diversity_loss_mlp": 0.0, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.0752969909322094, + "language_loss": 0.91929704, + "learning_rate": 0.0007794988714400633, + "loss": 0.93063056, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 1725, + "time_per_iteration": 2.615788698196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125798, + "balance_loss_mlp": 1.11242867, + "diversity_loss_mlp": 0.0, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.07890733478173245, + "language_loss": 0.85302055, + "learning_rate": 0.0007792404959206079, + "loss": 0.86427855, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.13372803, + "routerloss_mlp": 0.0, + "step": 1726, + "time_per_iteration": 2.545780897140503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107165, + "balance_loss_mlp": 1.09446895, + "diversity_loss_mlp": 0.0, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07756389475354548, + "language_loss": 0.81480336, + "learning_rate": 0.0007789820119903774, + "loss": 0.82587504, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.12689209, + "routerloss_mlp": 0.0, + "step": 1727, + "time_per_iteration": 3.005662441253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114992, + "balance_loss_mlp": 1.10335684, + "diversity_loss_mlp": 0.0, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.03748312413261812, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.7960766, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 1728, + "time_per_iteration": 4.833205223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105872, + "balance_loss_mlp": 1.09285486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.07170574552345628, + "language_loss": 0.83970881, + "learning_rate": 0.0007784647192990428, + "loss": 0.85076749, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 1729, + "time_per_iteration": 2.7309772968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107979, + "balance_loss_mlp": 1.0948776, + "diversity_loss_mlp": 0.0, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.06011930461286596, + "language_loss": 0.80777055, + "learning_rate": 0.0007782059107387696, + "loss": 0.81885028, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.13116455, + "routerloss_mlp": 0.0, + "step": 1730, + "time_per_iteration": 2.8615641593933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113195, + "balance_loss_mlp": 1.11733532, + "diversity_loss_mlp": 0.0, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08106060743083753, + "language_loss": 0.88617826, + "learning_rate": 0.0007779469941693826, + "loss": 0.89749771, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.1463623, + "routerloss_mlp": 0.0, + "step": 1731, + "time_per_iteration": 2.801208257675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126142, + "balance_loss_mlp": 1.11240935, + "diversity_loss_mlp": 0.0, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.09519717038034853, + "language_loss": 0.77091044, + "learning_rate": 0.0007776879696914029, + "loss": 0.78217185, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1732, + "time_per_iteration": 2.8286595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123068, + "balance_loss_mlp": 1.10889435, + "diversity_loss_mlp": 0.0, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.05947539267688924, + "language_loss": 0.88910627, + "learning_rate": 0.000777428837405392, + "loss": 0.90033698, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1733, + "time_per_iteration": 2.8319156169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121491, + "balance_loss_mlp": 1.10701954, + "diversity_loss_mlp": 0.0, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.07113995025739508, + "language_loss": 0.86735553, + "learning_rate": 0.0007771695974119544, + "loss": 0.87857044, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1734, + "time_per_iteration": 2.5376570224761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112031, + "balance_loss_mlp": 1.09795249, + "diversity_loss_mlp": 0.0, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.08734149249458338, + "language_loss": 0.75937277, + "learning_rate": 0.0007769102498117359, + "loss": 0.77049315, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1735, + "time_per_iteration": 3.093188524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105698, + "balance_loss_mlp": 1.09138131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.06929562674350419, + "language_loss": 0.79383999, + "learning_rate": 0.000776650794705424, + "loss": 0.80489695, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1736, + "time_per_iteration": 3.253673791885376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121685, + "balance_loss_mlp": 1.10730791, + "diversity_loss_mlp": 0.0, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.06325878214231093, + "language_loss": 0.82130396, + "learning_rate": 0.0007763912321937483, + "loss": 0.83252084, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1737, + "time_per_iteration": 2.7109947204589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117751, + "balance_loss_mlp": 1.10324299, + "diversity_loss_mlp": 0.0, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.08404595709863052, + "language_loss": 0.82403475, + "learning_rate": 0.0007761315623774799, + "loss": 0.83521223, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1738, + "time_per_iteration": 3.4125657081604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109047, + "balance_loss_mlp": 1.0946703, + "diversity_loss_mlp": 0.0, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.08421865543081901, + "language_loss": 0.87820536, + "learning_rate": 0.0007758717853574313, + "loss": 0.88929582, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1739, + "time_per_iteration": 2.7345223426818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106099, + "balance_loss_mlp": 1.09184134, + "diversity_loss_mlp": 0.0, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.07638673743764693, + "language_loss": 0.90095574, + "learning_rate": 0.0007756119012344571, + "loss": 0.91201669, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.14257812, + "routerloss_mlp": 0.0, + "step": 1740, + "time_per_iteration": 2.5901129245758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.08709717, + "diversity_loss_mlp": 0.0, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.06863708242027233, + "language_loss": 0.8461023, + "learning_rate": 0.0007753519101094535, + "loss": 0.85711253, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1741, + "time_per_iteration": 2.770315647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089984, + "balance_loss_mlp": 1.07595301, + "diversity_loss_mlp": 0.0, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.07992644583812669, + "language_loss": 0.86363387, + "learning_rate": 0.0007750918120833575, + "loss": 0.87453371, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.14050293, + "routerloss_mlp": 0.0, + "step": 1742, + "time_per_iteration": 2.58940052986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.07488728, + "diversity_loss_mlp": 0.0, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.11201991585260462, + "language_loss": 0.87392128, + "learning_rate": 0.0007748316072571485, + "loss": 0.88480592, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1743, + "time_per_iteration": 2.8557286262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_mlp": 1.07202053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.0749416267225997, + "language_loss": 0.79045737, + "learning_rate": 0.0007745712957318467, + "loss": 0.80131996, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1744, + "time_per_iteration": 2.9912548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084233, + "balance_loss_mlp": 1.07057166, + "diversity_loss_mlp": 0.0, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06946859722884112, + "language_loss": 0.86471289, + "learning_rate": 0.0007743108776085141, + "loss": 0.87555522, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1745, + "time_per_iteration": 2.7899224758148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084466, + "balance_loss_mlp": 1.07023191, + "diversity_loss_mlp": 0.0, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08256839233284315, + "language_loss": 0.82965624, + "learning_rate": 0.0007740503529882543, + "loss": 0.84050083, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.14233398, + "routerloss_mlp": 0.0, + "step": 1746, + "time_per_iteration": 2.808084011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084564, + "balance_loss_mlp": 1.07044971, + "diversity_loss_mlp": 0.0, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.07349682427851349, + "language_loss": 0.90707254, + "learning_rate": 0.0007737897219722114, + "loss": 0.91791821, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1747, + "time_per_iteration": 2.712833881378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092286, + "balance_loss_mlp": 1.07794499, + "diversity_loss_mlp": 0.0, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.05794758251669461, + "language_loss": 0.81094921, + "learning_rate": 0.0007735289846615716, + "loss": 0.82187206, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.14343262, + "routerloss_mlp": 0.0, + "step": 1748, + "time_per_iteration": 2.677976369857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108166, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.0827866783592608, + "language_loss": 0.823035, + "learning_rate": 0.0007732681411575621, + "loss": 0.8341167, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.13586426, + "routerloss_mlp": 0.0, + "step": 1749, + "time_per_iteration": 2.674349069595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114062, + "balance_loss_mlp": 1.09997165, + "diversity_loss_mlp": 0.0, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.4203922337067485, + "language_loss": 0.87328398, + "learning_rate": 0.0007730071915614514, + "loss": 0.88442457, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.14086914, + "routerloss_mlp": 0.0, + "step": 1750, + "time_per_iteration": 2.6714634895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113648, + "balance_loss_mlp": 1.10037947, + "diversity_loss_mlp": 0.0, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.09571011442330926, + "language_loss": 0.88792437, + "learning_rate": 0.0007727461359745489, + "loss": 0.89906085, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.13293457, + "routerloss_mlp": 0.0, + "step": 1751, + "time_per_iteration": 2.469905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141755, + "balance_loss_mlp": 1.12897623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.07412184794878955, + "language_loss": 0.85941112, + "learning_rate": 0.0007724849744982056, + "loss": 0.87082875, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 1752, + "time_per_iteration": 2.6805977821350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117715, + "balance_loss_mlp": 1.16388226, + "diversity_loss_mlp": 0.0, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.09378397224837084, + "language_loss": 0.81843758, + "learning_rate": 0.0007722237072338131, + "loss": 0.83020908, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.1328125, + "routerloss_mlp": 0.0, + "step": 1753, + "time_per_iteration": 2.7348344326019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186311, + "balance_loss_mlp": 1.17280459, + "diversity_loss_mlp": 0.0, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.1034159122014491, + "language_loss": 0.85304463, + "learning_rate": 0.0007719623342828046, + "loss": 0.86490774, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1754, + "time_per_iteration": 2.5181336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202577, + "balance_loss_mlp": 1.18872511, + "diversity_loss_mlp": 0.0, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.12703041648808322, + "language_loss": 0.84088987, + "learning_rate": 0.000771700855746654, + "loss": 0.85291564, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1755, + "time_per_iteration": 2.590925931930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188345, + "balance_loss_mlp": 1.1743381, + "diversity_loss_mlp": 0.0, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06849832931784437, + "language_loss": 0.88371092, + "learning_rate": 0.0007714392717268763, + "loss": 0.89559436, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.14013672, + "routerloss_mlp": 0.0, + "step": 1756, + "time_per_iteration": 2.560246706008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189305, + "balance_loss_mlp": 1.17545295, + "diversity_loss_mlp": 0.0, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.09135673410225151, + "language_loss": 0.8630141, + "learning_rate": 0.0007711775823250273, + "loss": 0.8749072, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.13867188, + "routerloss_mlp": 0.0, + "step": 1757, + "time_per_iteration": 2.562939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194838, + "balance_loss_mlp": 1.18069935, + "diversity_loss_mlp": 0.0, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.07414503329772545, + "language_loss": 0.83081156, + "learning_rate": 0.0007709157876427039, + "loss": 0.84275991, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.14147949, + "routerloss_mlp": 0.0, + "step": 1758, + "time_per_iteration": 3.0652947425842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190916, + "balance_loss_mlp": 1.17681408, + "diversity_loss_mlp": 0.0, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.06977999371164574, + "language_loss": 0.85321373, + "learning_rate": 0.0007706538877815439, + "loss": 0.86512285, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.14111328, + "routerloss_mlp": 0.0, + "step": 1759, + "time_per_iteration": 2.5949320793151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202515, + "balance_loss_mlp": 1.1888063, + "diversity_loss_mlp": 0.0, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.052908737395413206, + "language_loss": 0.83029473, + "learning_rate": 0.0007703918828432259, + "loss": 0.84231991, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.13720703, + "routerloss_mlp": 0.0, + "step": 1760, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231589, + "balance_loss_mlp": 1.21696198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.11529749255982873, + "language_loss": 0.89274669, + "learning_rate": 0.000770129772929469, + "loss": 0.90506256, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.14611816, + "routerloss_mlp": 0.0, + "step": 1761, + "time_per_iteration": 2.6486427783966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212596, + "balance_loss_mlp": 1.19812357, + "diversity_loss_mlp": 0.0, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.10010821715075297, + "language_loss": 0.8820551, + "learning_rate": 0.0007698675581420334, + "loss": 0.89418107, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.14453125, + "routerloss_mlp": 0.0, + "step": 1762, + "time_per_iteration": 2.8473589420318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170537, + "balance_loss_mlp": 1.15610099, + "diversity_loss_mlp": 0.0, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.06768336788468338, + "language_loss": 0.79040444, + "learning_rate": 0.0007696052385827199, + "loss": 0.80210984, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.14440918, + "routerloss_mlp": 0.0, + "step": 1763, + "time_per_iteration": 2.9893951416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147034, + "balance_loss_mlp": 1.13271689, + "diversity_loss_mlp": 0.0, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.06731413775333611, + "language_loss": 0.78161937, + "learning_rate": 0.00076934281435337, + "loss": 0.79308975, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.14318848, + "routerloss_mlp": 0.0, + "step": 1764, + "time_per_iteration": 2.7329161167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933074, + "balance_loss_mlp": 1.62411106, + "diversity_loss_mlp": 0.20785357, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.0341650984642099, + "language_loss": 0.86205357, + "learning_rate": 0.0007690802855558658, + "loss": 0.87138426, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0170921, + "step": 1765, + "time_per_iteration": 2.9281163215637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121638, + "balance_loss_mlp": 1.10924029, + "diversity_loss_mlp": 0.0, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.029090002598214117, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77496594, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 1766, + "time_per_iteration": 4.91774320602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104609, + "balance_loss_mlp": 1.08886182, + "diversity_loss_mlp": 0.0, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.08396151855964885, + "language_loss": 0.89357018, + "learning_rate": 0.0007685549146641262, + "loss": 0.90461624, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.15734863, + "routerloss_mlp": 0.0, + "step": 1767, + "time_per_iteration": 2.5867435932159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108303, + "balance_loss_mlp": 1.093521, + "diversity_loss_mlp": 0.0, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.10736891621188589, + "language_loss": 0.8816734, + "learning_rate": 0.0007682920727738579, + "loss": 0.89275646, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1768, + "time_per_iteration": 2.5119268894195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102653, + "balance_loss_mlp": 1.08738232, + "diversity_loss_mlp": 0.0, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.10494960168224592, + "language_loss": 0.85048056, + "learning_rate": 0.000768029126723369, + "loss": 0.86150718, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.15246582, + "routerloss_mlp": 0.0, + "step": 1769, + "time_per_iteration": 2.495424270629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090257, + "balance_loss_mlp": 1.07520068, + "diversity_loss_mlp": 0.0, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08686425564719477, + "language_loss": 0.82128584, + "learning_rate": 0.0007677660766147447, + "loss": 0.83218843, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.15039062, + "routerloss_mlp": 0.0, + "step": 1770, + "time_per_iteration": 2.532904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_mlp": 1.05578792, + "diversity_loss_mlp": 0.0, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.023964921008177247, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73537892, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 1771, + "time_per_iteration": 4.944117784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.1034112, + "diversity_loss_mlp": 0.0, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.10616133846526872, + "language_loss": 0.795196, + "learning_rate": 0.0007672396646316306, + "loss": 0.80637527, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1772, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134399, + "balance_loss_mlp": 1.11959314, + "diversity_loss_mlp": 0.0, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.07513330183645242, + "language_loss": 0.80376065, + "learning_rate": 0.000766976302961512, + "loss": 0.8151046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1773, + "time_per_iteration": 3.042421340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158934, + "balance_loss_mlp": 1.14410484, + "diversity_loss_mlp": 0.0, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.07872996810077096, + "language_loss": 0.81390858, + "learning_rate": 0.0007667128376420003, + "loss": 0.82549793, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1774, + "time_per_iteration": 2.536562442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208475, + "balance_loss_mlp": 1.19358635, + "diversity_loss_mlp": 0.0, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.08297883362487203, + "language_loss": 0.8462863, + "learning_rate": 0.0007664492687753817, + "loss": 0.85837102, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1775, + "time_per_iteration": 2.6977102756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198612, + "balance_loss_mlp": 1.18424678, + "diversity_loss_mlp": 0.0, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.10155126624771216, + "language_loss": 0.81542516, + "learning_rate": 0.000766185596463983, + "loss": 0.82741123, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1776, + "time_per_iteration": 2.6038215160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196202, + "balance_loss_mlp": 1.18163514, + "diversity_loss_mlp": 0.0, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.0897891274607312, + "language_loss": 0.77011722, + "learning_rate": 0.0007659218208101706, + "loss": 0.78207922, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1777, + "time_per_iteration": 3.0933022499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173425, + "balance_loss_mlp": 1.15902483, + "diversity_loss_mlp": 0.0, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.08364054831663822, + "language_loss": 0.85122472, + "learning_rate": 0.0007656579419163515, + "loss": 0.86295897, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.1439209, + "routerloss_mlp": 0.0, + "step": 1778, + "time_per_iteration": 2.732297420501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146549, + "balance_loss_mlp": 1.13211274, + "diversity_loss_mlp": 0.0, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.0722191895240348, + "language_loss": 0.77409559, + "learning_rate": 0.0007653939598849724, + "loss": 0.78556108, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.14416504, + "routerloss_mlp": 0.0, + "step": 1779, + "time_per_iteration": 2.4908664226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_mlp": 1.02253902, + "diversity_loss_mlp": 0.0, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.029240552967656448, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83912855, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 1780, + "time_per_iteration": 4.9182775020599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10688317, + "diversity_loss_mlp": 0.0, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.07624931845389674, + "language_loss": 0.80176342, + "learning_rate": 0.000764865686819522, + "loss": 0.81297386, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1781, + "time_per_iteration": 3.0602052211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111853, + "balance_loss_mlp": 1.097965, + "diversity_loss_mlp": 0.0, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.07936344533488468, + "language_loss": 0.85836053, + "learning_rate": 0.0007646013959905449, + "loss": 0.86947906, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1782, + "time_per_iteration": 2.5750925540924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109071, + "balance_loss_mlp": 1.09528995, + "diversity_loss_mlp": 0.0, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.07233814650781724, + "language_loss": 0.81042612, + "learning_rate": 0.0007643370024341949, + "loss": 0.82151681, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1783, + "time_per_iteration": 3.0870087146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09431553, + "diversity_loss_mlp": 0.0, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.07806584209391611, + "language_loss": 0.83175099, + "learning_rate": 0.0007640725062531195, + "loss": 0.84283221, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1784, + "time_per_iteration": 2.5063886642456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102776, + "balance_loss_mlp": 1.08888865, + "diversity_loss_mlp": 0.0, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.5067557182324087, + "language_loss": 0.86699629, + "learning_rate": 0.0007638079075500047, + "loss": 0.87802398, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1785, + "time_per_iteration": 2.532945394515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015111, + "balance_loss_mlp": 1.00562215, + "diversity_loss_mlp": 0.0, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.016449027395748255, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76195776, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 1786, + "time_per_iteration": 4.944318056106567 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150049, + "balance_loss_mlp": 1.13542247, + "diversity_loss_mlp": 0.0, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.07356798682381475, + "language_loss": 0.83088338, + "learning_rate": 0.0007632784029886026, + "loss": 0.84238386, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.14599609, + "routerloss_mlp": 0.0, + "step": 1787, + "time_per_iteration": 2.6217002868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204344, + "balance_loss_mlp": 1.1884768, + "diversity_loss_mlp": 0.0, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.08799574205003287, + "language_loss": 0.85466659, + "learning_rate": 0.0007630134973358873, + "loss": 0.86671007, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.15856934, + "routerloss_mlp": 0.0, + "step": 1788, + "time_per_iteration": 2.9664394855499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251833, + "balance_loss_mlp": 1.2359066, + "diversity_loss_mlp": 0.0, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.1052875761358054, + "language_loss": 0.86575854, + "learning_rate": 0.0007627484895722763, + "loss": 0.87827688, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.15917969, + "routerloss_mlp": 0.0, + "step": 1789, + "time_per_iteration": 2.67280912399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247407, + "balance_loss_mlp": 1.23117065, + "diversity_loss_mlp": 0.0, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.09611070791328494, + "language_loss": 0.80025196, + "learning_rate": 0.0007624833798006552, + "loss": 0.81272602, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.16235352, + "routerloss_mlp": 0.0, + "step": 1790, + "time_per_iteration": 3.046809196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238128, + "balance_loss_mlp": 1.22221315, + "diversity_loss_mlp": 0.0, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.07959093752215074, + "language_loss": 0.83783114, + "learning_rate": 0.0007622181681239483, + "loss": 0.8502124, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.15905762, + "routerloss_mlp": 0.0, + "step": 1791, + "time_per_iteration": 2.6601433753967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244342, + "balance_loss_mlp": 1.22793913, + "diversity_loss_mlp": 0.0, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.07919089267187412, + "language_loss": 0.84668601, + "learning_rate": 0.0007619528546451202, + "loss": 0.85912943, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.1640625, + "routerloss_mlp": 0.0, + "step": 1792, + "time_per_iteration": 2.782947063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208587, + "balance_loss_mlp": 1.19314909, + "diversity_loss_mlp": 0.0, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.07332959959795217, + "language_loss": 0.83832949, + "learning_rate": 0.0007616874394671745, + "loss": 0.85041535, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.1541748, + "routerloss_mlp": 0.0, + "step": 1793, + "time_per_iteration": 3.3206703662872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.169258, + "diversity_loss_mlp": 0.0, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.0713753042238581, + "language_loss": 0.85051751, + "learning_rate": 0.0007614219226931547, + "loss": 0.86236751, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1794, + "time_per_iteration": 2.7190396785736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179587, + "balance_loss_mlp": 1.16401851, + "diversity_loss_mlp": 0.0, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.07163818055438703, + "language_loss": 0.8457973, + "learning_rate": 0.0007611563044261435, + "loss": 0.85759324, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.15551758, + "routerloss_mlp": 0.0, + "step": 1795, + "time_per_iteration": 2.5077741146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150042, + "balance_loss_mlp": 1.13422251, + "diversity_loss_mlp": 0.0, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0670543853763616, + "language_loss": 0.86376798, + "learning_rate": 0.0007608905847692631, + "loss": 0.8752684, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.15808105, + "routerloss_mlp": 0.0, + "step": 1796, + "time_per_iteration": 2.4662768840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112741, + "balance_loss_mlp": 1.11171043, + "diversity_loss_mlp": 0.0, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.07671810253227593, + "language_loss": 0.86553091, + "learning_rate": 0.0007606247638256749, + "loss": 0.87680501, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.15686035, + "routerloss_mlp": 0.0, + "step": 1797, + "time_per_iteration": 2.8649494647979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00624206, + "balance_loss_mlp": 1.05204535, + "diversity_loss_mlp": 0.16984753, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0016633519833830733, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.78794497, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01325956, + "step": 1798, + "time_per_iteration": 4.963132619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_mlp": 1.04498482, + "diversity_loss_mlp": 0.0, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.032920799461559694, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80382872, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 1799, + "time_per_iteration": 4.773633003234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099713, + "balance_loss_mlp": 1.08345306, + "diversity_loss_mlp": 0.0, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.10233507255995049, + "language_loss": 0.85892332, + "learning_rate": 0.0007598266943068686, + "loss": 0.86992049, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.16259766, + "routerloss_mlp": 0.0, + "step": 1800, + "time_per_iteration": 2.7380948066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092311, + "balance_loss_mlp": 1.0761466, + "diversity_loss_mlp": 0.0, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.08416075255699706, + "language_loss": 0.83903629, + "learning_rate": 0.0007595604692488507, + "loss": 0.84995937, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1801, + "time_per_iteration": 2.5558300018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099836, + "balance_loss_mlp": 1.08382583, + "diversity_loss_mlp": 0.0, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.0681721192963598, + "language_loss": 0.82674247, + "learning_rate": 0.0007592941434205215, + "loss": 0.83774084, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.16003418, + "routerloss_mlp": 0.0, + "step": 1802, + "time_per_iteration": 2.8181002140045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017477, + "balance_loss_mlp": 1.00651026, + "diversity_loss_mlp": 0.0, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.018274165575771096, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588537, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10986328, + "routerloss_mlp": 0.0, + "step": 1803, + "time_per_iteration": 5.063629388809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126513, + "balance_loss_mlp": 1.11121821, + "diversity_loss_mlp": 0.0, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07342722091818694, + "language_loss": 0.80217302, + "learning_rate": 0.0007587611898665566, + "loss": 0.81343818, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.15270996, + "routerloss_mlp": 0.0, + "step": 1804, + "time_per_iteration": 3.0994317531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113982, + "balance_loss_mlp": 1.12468028, + "diversity_loss_mlp": 0.0, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.05936466476556785, + "language_loss": 0.82130265, + "learning_rate": 0.0007584945623478315, + "loss": 0.83270085, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.15112305, + "routerloss_mlp": 0.0, + "step": 1805, + "time_per_iteration": 2.833981513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152624, + "balance_loss_mlp": 1.13780582, + "diversity_loss_mlp": 0.0, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.08744691316973383, + "language_loss": 0.80801159, + "learning_rate": 0.000758227834472617, + "loss": 0.81953788, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.14807129, + "routerloss_mlp": 0.0, + "step": 1806, + "time_per_iteration": 3.0535178184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166216, + "balance_loss_mlp": 1.15111172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07500761638021176, + "language_loss": 0.77729452, + "learning_rate": 0.0007579610063444664, + "loss": 0.7889567, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1807, + "time_per_iteration": 2.7615864276885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149194, + "balance_loss_mlp": 1.1339947, + "diversity_loss_mlp": 0.0, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.07406875426876382, + "language_loss": 0.87547183, + "learning_rate": 0.0007576940780669712, + "loss": 0.88696373, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.1517334, + "routerloss_mlp": 0.0, + "step": 1808, + "time_per_iteration": 3.264080762863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143539, + "balance_loss_mlp": 1.12863731, + "diversity_loss_mlp": 0.0, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07928472428244501, + "language_loss": 0.84104979, + "learning_rate": 0.0007574270497437624, + "loss": 0.85248518, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.14880371, + "routerloss_mlp": 0.0, + "step": 1809, + "time_per_iteration": 2.9859273433685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128376, + "balance_loss_mlp": 1.11302221, + "diversity_loss_mlp": 0.0, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.07150597602774303, + "language_loss": 0.88426095, + "learning_rate": 0.000757159921478509, + "loss": 0.89554477, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.15332031, + "routerloss_mlp": 0.0, + "step": 1810, + "time_per_iteration": 2.7891488075256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057512, + "balance_loss_mlp": 1.04754615, + "diversity_loss_mlp": 0.0, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.03228641235871289, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75508153, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 1811, + "time_per_iteration": 4.737962007522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103488, + "balance_loss_mlp": 1.08814573, + "diversity_loss_mlp": 0.0, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.07438083858778873, + "language_loss": 0.87798911, + "learning_rate": 0.0007566253655367423, + "loss": 0.88902402, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.15319824, + "routerloss_mlp": 0.0, + "step": 1812, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.07600367, + "diversity_loss_mlp": 0.0, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.06854488097647142, + "language_loss": 0.8957805, + "learning_rate": 0.000756357938067762, + "loss": 0.90669596, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.15527344, + "routerloss_mlp": 0.0, + "step": 1813, + "time_per_iteration": 2.7090489864349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094415, + "balance_loss_mlp": 1.07826209, + "diversity_loss_mlp": 0.0, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.0690606019510397, + "language_loss": 0.8334865, + "learning_rate": 0.0007560904110718033, + "loss": 0.84443069, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.16149902, + "routerloss_mlp": 0.0, + "step": 1814, + "time_per_iteration": 3.2445590496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096093, + "balance_loss_mlp": 1.08003569, + "diversity_loss_mlp": 0.0, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.06223934742271703, + "language_loss": 0.83650601, + "learning_rate": 0.0007558227846527297, + "loss": 0.84746695, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.16052246, + "routerloss_mlp": 0.0, + "step": 1815, + "time_per_iteration": 2.8504550457000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110334, + "balance_loss_mlp": 1.08731842, + "diversity_loss_mlp": 0.0, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.07831164241761415, + "language_loss": 0.83117825, + "learning_rate": 0.0007555550589144429, + "loss": 0.84221166, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1816, + "time_per_iteration": 2.4655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111049, + "balance_loss_mlp": 1.09515882, + "diversity_loss_mlp": 0.0, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.08460625336983617, + "language_loss": 0.84522688, + "learning_rate": 0.000755287233960883, + "loss": 0.85633731, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.15881348, + "routerloss_mlp": 0.0, + "step": 1817, + "time_per_iteration": 2.602492094039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089592, + "balance_loss_mlp": 1.07385683, + "diversity_loss_mlp": 0.0, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.07045705340523431, + "language_loss": 0.77682364, + "learning_rate": 0.0007550193098960292, + "loss": 0.78771949, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.15722656, + "routerloss_mlp": 0.0, + "step": 1818, + "time_per_iteration": 2.8674800395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00989642, + "balance_loss_mlp": 1.73270237, + "diversity_loss_mlp": 0.21087486, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.029406524514427698, + "language_loss": 0.86412024, + "learning_rate": 0.0007547512868238988, + "loss": 0.87401664, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01785346, + "step": 1819, + "time_per_iteration": 3.151559829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_mlp": 1.07453036, + "diversity_loss_mlp": 0.0, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.06124546921927801, + "language_loss": 0.83503008, + "learning_rate": 0.0007544831648485473, + "loss": 0.84593564, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.16015625, + "routerloss_mlp": 0.0, + "step": 1820, + "time_per_iteration": 2.6791367530822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094234, + "balance_loss_mlp": 1.07806909, + "diversity_loss_mlp": 0.0, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.08232155140582742, + "language_loss": 0.81448233, + "learning_rate": 0.0007542149440740694, + "loss": 0.82542467, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.16162109, + "routerloss_mlp": 0.0, + "step": 1821, + "time_per_iteration": 2.665632724761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.07229352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.08177047744866778, + "language_loss": 0.85514361, + "learning_rate": 0.000753946624604597, + "loss": 0.8660273, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.16064453, + "routerloss_mlp": 0.0, + "step": 1822, + "time_per_iteration": 2.708221673965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085159, + "balance_loss_mlp": 1.06938744, + "diversity_loss_mlp": 0.0, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.07022994660183399, + "language_loss": 0.88119262, + "learning_rate": 0.0007536782065443015, + "loss": 0.89204431, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.15759277, + "routerloss_mlp": 0.0, + "step": 1823, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109141, + "balance_loss_mlp": 1.0758059, + "diversity_loss_mlp": 0.0, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.09965750131036237, + "language_loss": 0.75038946, + "learning_rate": 0.0007534096899973919, + "loss": 0.7613036, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.15588379, + "routerloss_mlp": 0.0, + "step": 1824, + "time_per_iteration": 2.585160732269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089888, + "balance_loss_mlp": 1.07460535, + "diversity_loss_mlp": 0.0, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.0636070515998131, + "language_loss": 0.82941401, + "learning_rate": 0.0007531410750681154, + "loss": 0.84031284, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.15258789, + "routerloss_mlp": 0.0, + "step": 1825, + "time_per_iteration": 2.7595911026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100118, + "balance_loss_mlp": 1.08562207, + "diversity_loss_mlp": 0.0, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.09267960960885083, + "language_loss": 0.87015611, + "learning_rate": 0.0007528723618607575, + "loss": 0.88115728, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.14489746, + "routerloss_mlp": 0.0, + "step": 1826, + "time_per_iteration": 3.4216692447662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090335, + "balance_loss_mlp": 1.07524323, + "diversity_loss_mlp": 0.0, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.07214965975453298, + "language_loss": 0.82582879, + "learning_rate": 0.0007526035504796422, + "loss": 0.83673215, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.15087891, + "routerloss_mlp": 0.0, + "step": 1827, + "time_per_iteration": 2.7822000980377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094099, + "balance_loss_mlp": 1.0794003, + "diversity_loss_mlp": 0.0, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.07057247929289283, + "language_loss": 0.86824054, + "learning_rate": 0.0007523346410291312, + "loss": 0.8791815, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1828, + "time_per_iteration": 2.7560181617736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098232, + "balance_loss_mlp": 1.08291376, + "diversity_loss_mlp": 0.0, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.0630617970486185, + "language_loss": 0.85159689, + "learning_rate": 0.0007520656336136245, + "loss": 0.86257917, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.15307617, + "routerloss_mlp": 0.0, + "step": 1829, + "time_per_iteration": 2.9432313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098974, + "balance_loss_mlp": 1.08431172, + "diversity_loss_mlp": 0.0, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.06541232162591855, + "language_loss": 0.88230217, + "learning_rate": 0.0007517965283375599, + "loss": 0.89329195, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1830, + "time_per_iteration": 2.8773486614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098552, + "balance_loss_mlp": 1.08363926, + "diversity_loss_mlp": 0.0, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.06973135687475002, + "language_loss": 0.89511967, + "learning_rate": 0.0007515273253054132, + "loss": 0.90610522, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.14892578, + "routerloss_mlp": 0.0, + "step": 1831, + "time_per_iteration": 2.662757396697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097986, + "balance_loss_mlp": 1.08288169, + "diversity_loss_mlp": 0.0, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.07142201858296882, + "language_loss": 0.82785273, + "learning_rate": 0.0007512580246216988, + "loss": 0.83883256, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.15075684, + "routerloss_mlp": 0.0, + "step": 1832, + "time_per_iteration": 2.730994939804077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096378, + "balance_loss_mlp": 1.08164394, + "diversity_loss_mlp": 0.0, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.07119734441282773, + "language_loss": 0.84715027, + "learning_rate": 0.000750988626390968, + "loss": 0.85811406, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.1472168, + "routerloss_mlp": 0.0, + "step": 1833, + "time_per_iteration": 2.604182004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.07508624, + "diversity_loss_mlp": 0.0, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.07060575001723658, + "language_loss": 0.85089648, + "learning_rate": 0.0007507191307178108, + "loss": 0.86179501, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.14746094, + "routerloss_mlp": 0.0, + "step": 1834, + "time_per_iteration": 2.7584774494171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_mlp": 1.06808281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.09392412586459238, + "language_loss": 0.75105453, + "learning_rate": 0.0007504495377068543, + "loss": 0.76188982, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.15429688, + "routerloss_mlp": 0.0, + "step": 1835, + "time_per_iteration": 2.731039524078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087025, + "balance_loss_mlp": 1.07230306, + "diversity_loss_mlp": 0.0, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09299008065025831, + "language_loss": 0.81784093, + "learning_rate": 0.0007501798474627642, + "loss": 0.82871115, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.14697266, + "routerloss_mlp": 0.0, + "step": 1836, + "time_per_iteration": 2.9180665016174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092183, + "balance_loss_mlp": 1.07738876, + "diversity_loss_mlp": 0.0, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.06800399913452355, + "language_loss": 0.8354817, + "learning_rate": 0.0007499100600902433, + "loss": 0.84640354, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.14782715, + "routerloss_mlp": 0.0, + "step": 1837, + "time_per_iteration": 2.981478452682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097597, + "balance_loss_mlp": 1.08236217, + "diversity_loss_mlp": 0.0, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.07178124654929893, + "language_loss": 0.83625698, + "learning_rate": 0.0007496401756940324, + "loss": 0.84723294, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.15209961, + "routerloss_mlp": 0.0, + "step": 1838, + "time_per_iteration": 2.7256877422332764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107267, + "balance_loss_mlp": 1.09267545, + "diversity_loss_mlp": 0.0, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.08438072522416575, + "language_loss": 0.81940264, + "learning_rate": 0.0007493701943789098, + "loss": 0.83047533, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.14575195, + "routerloss_mlp": 0.0, + "step": 1839, + "time_per_iteration": 2.805553674697876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117723, + "balance_loss_mlp": 1.10266685, + "diversity_loss_mlp": 0.0, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07000666511795951, + "language_loss": 0.82830888, + "learning_rate": 0.000749100116249692, + "loss": 0.83948612, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.1505127, + "routerloss_mlp": 0.0, + "step": 1840, + "time_per_iteration": 2.608135223388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00954188, + "balance_loss_mlp": 1.66862321, + "diversity_loss_mlp": 0.20571998, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.03743173710930313, + "language_loss": 0.86076337, + "learning_rate": 0.0007488299414112321, + "loss": 0.87030524, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01701665, + "step": 1841, + "time_per_iteration": 2.6307811737060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112413, + "balance_loss_mlp": 1.10974133, + "diversity_loss_mlp": 0.0, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.06710116446149988, + "language_loss": 0.77204335, + "learning_rate": 0.0007485596699684215, + "loss": 0.78328466, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1842, + "time_per_iteration": 2.808776378631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132611, + "balance_loss_mlp": 1.11780548, + "diversity_loss_mlp": 0.0, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.07987851383877129, + "language_loss": 0.85353696, + "learning_rate": 0.000748289302026189, + "loss": 0.86486304, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1843, + "time_per_iteration": 2.8449106216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127963, + "balance_loss_mlp": 1.11339569, + "diversity_loss_mlp": 0.0, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.06918658934745357, + "language_loss": 0.85752398, + "learning_rate": 0.0007480188376895004, + "loss": 0.86880362, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.14550781, + "routerloss_mlp": 0.0, + "step": 1844, + "time_per_iteration": 3.0339298248291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160602, + "balance_loss_mlp": 1.15135121, + "diversity_loss_mlp": 0.0, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.06421168097867443, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74971944, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 1845, + "time_per_iteration": 4.932978391647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119726, + "balance_loss_mlp": 1.10506296, + "diversity_loss_mlp": 0.0, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08194467088107492, + "language_loss": 0.78768218, + "learning_rate": 0.0007474776202528074, + "loss": 0.79887938, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.14660645, + "routerloss_mlp": 0.0, + "step": 1846, + "time_per_iteration": 2.9188990592956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111713, + "balance_loss_mlp": 1.1021452, + "diversity_loss_mlp": 0.0, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08015412782248336, + "language_loss": 0.80999184, + "learning_rate": 0.000747206867362922, + "loss": 0.82116312, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.14953613, + "routerloss_mlp": 0.0, + "step": 1847, + "time_per_iteration": 3.0966272354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099814, + "balance_loss_mlp": 1.085235, + "diversity_loss_mlp": 0.0, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.09857033029565816, + "language_loss": 0.836568, + "learning_rate": 0.0007469360184988194, + "loss": 0.84756613, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.14562988, + "routerloss_mlp": 0.0, + "step": 1848, + "time_per_iteration": 2.9021246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104687, + "balance_loss_mlp": 1.08986914, + "diversity_loss_mlp": 0.0, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08185517170087683, + "language_loss": 0.86821651, + "learning_rate": 0.0007466650737656518, + "loss": 0.8792634, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.14794922, + "routerloss_mlp": 0.0, + "step": 1849, + "time_per_iteration": 2.615549325942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102053, + "balance_loss_mlp": 1.0876888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.06916390030254578, + "language_loss": 0.89687926, + "learning_rate": 0.0007463940332686098, + "loss": 0.9078998, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.14367676, + "routerloss_mlp": 0.0, + "step": 1850, + "time_per_iteration": 2.497159242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931214, + "balance_loss_mlp": 1.62144685, + "diversity_loss_mlp": 0.20650919, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.030410176313075864, + "language_loss": 0.84120536, + "learning_rate": 0.0007461228971129205, + "loss": 0.85051751, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01723633, + "step": 1851, + "time_per_iteration": 2.959170341491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931448, + "balance_loss_mlp": 1.62270963, + "diversity_loss_mlp": 0.20620242, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.03221270440610224, + "language_loss": 0.85523784, + "learning_rate": 0.0007458516654038483, + "loss": 0.86455238, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01699215, + "step": 1852, + "time_per_iteration": 2.6886868476867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149336, + "balance_loss_mlp": 1.13526964, + "diversity_loss_mlp": 0.0, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.06572834298852859, + "language_loss": 0.86835778, + "learning_rate": 0.0007455803382466946, + "loss": 0.8798511, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.14074707, + "routerloss_mlp": 0.0, + "step": 1853, + "time_per_iteration": 2.8323659896850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151398, + "balance_loss_mlp": 1.13686657, + "diversity_loss_mlp": 0.0, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.06349489422764842, + "language_loss": 0.86956179, + "learning_rate": 0.0007453089157467979, + "loss": 0.88107574, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.1451416, + "routerloss_mlp": 0.0, + "step": 1854, + "time_per_iteration": 2.817117929458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151473, + "balance_loss_mlp": 1.13687038, + "diversity_loss_mlp": 0.0, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.06687597930641362, + "language_loss": 0.8221277, + "learning_rate": 0.0007450373980095341, + "loss": 0.83364242, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1855, + "time_per_iteration": 3.0857772827148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148466, + "balance_loss_mlp": 1.13494754, + "diversity_loss_mlp": 0.0, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.0656889709190827, + "language_loss": 0.86804116, + "learning_rate": 0.0007447657851403155, + "loss": 0.87952584, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1856, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144273, + "balance_loss_mlp": 1.1303966, + "diversity_loss_mlp": 0.0, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.08894932465162153, + "language_loss": 0.78988904, + "learning_rate": 0.0007444940772445915, + "loss": 0.80133176, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.13879395, + "routerloss_mlp": 0.0, + "step": 1857, + "time_per_iteration": 2.752232551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122576, + "balance_loss_mlp": 1.10860419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.06705763345081875, + "language_loss": 0.80129987, + "learning_rate": 0.0007442222744278484, + "loss": 0.81252563, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.13989258, + "routerloss_mlp": 0.0, + "step": 1858, + "time_per_iteration": 2.638322591781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110883, + "balance_loss_mlp": 1.09717393, + "diversity_loss_mlp": 0.0, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.05935371072747042, + "language_loss": 0.8399322, + "learning_rate": 0.0007439503767956099, + "loss": 0.85104102, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.137146, + "routerloss_mlp": 0.0, + "step": 1859, + "time_per_iteration": 2.699204921722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124434, + "balance_loss_mlp": 1.11480188, + "diversity_loss_mlp": 0.0, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.03541879327423246, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80796039, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 1860, + "time_per_iteration": 4.89499831199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089258, + "balance_loss_mlp": 1.07479787, + "diversity_loss_mlp": 0.0, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.06413043417122823, + "language_loss": 0.86215138, + "learning_rate": 0.000743406297506922, + "loss": 0.87304389, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.14465332, + "routerloss_mlp": 0.0, + "step": 1861, + "time_per_iteration": 2.7184388637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919817, + "balance_loss_mlp": 1.60078692, + "diversity_loss_mlp": 0.20507258, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.028510278569739433, + "language_loss": 0.84439111, + "learning_rate": 0.0007431341160617031, + "loss": 0.8535893, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01688758, + "step": 1862, + "time_per_iteration": 2.8915610313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084391, + "balance_loss_mlp": 1.06988358, + "diversity_loss_mlp": 0.0, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.06954606141633879, + "language_loss": 0.88100171, + "learning_rate": 0.0007428618402234491, + "loss": 0.8918457, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.14501953, + "routerloss_mlp": 0.0, + "step": 1863, + "time_per_iteration": 2.6724555492401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087919, + "balance_loss_mlp": 1.0733279, + "diversity_loss_mlp": 0.0, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.07542508091229044, + "language_loss": 0.80288851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81376767, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.14587402, + "routerloss_mlp": 0.0, + "step": 1864, + "time_per_iteration": 2.724853038787842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.06996608, + "diversity_loss_mlp": 0.0, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07695346444963648, + "language_loss": 0.7981261, + "learning_rate": 0.0007423170057906996, + "loss": 0.80896473, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.13916016, + "routerloss_mlp": 0.0, + "step": 1865, + "time_per_iteration": 3.9006779193878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108627, + "balance_loss_mlp": 1.0722512, + "diversity_loss_mlp": 0.0, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.07814080760266444, + "language_loss": 0.86228722, + "learning_rate": 0.0007420444474077275, + "loss": 0.87314993, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.14025879, + "routerloss_mlp": 0.0, + "step": 1866, + "time_per_iteration": 2.546194076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095193, + "balance_loss_mlp": 1.0812335, + "diversity_loss_mlp": 0.0, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.0773553058948038, + "language_loss": 0.8949936, + "learning_rate": 0.0007417717950547671, + "loss": 0.90594554, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1867, + "time_per_iteration": 2.5670700073242188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_mlp": 1.04262233, + "diversity_loss_mlp": 0.0, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.023944930622272237, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.770491, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 1868, + "time_per_iteration": 4.900780200958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101302, + "balance_loss_mlp": 1.087533, + "diversity_loss_mlp": 0.0, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.06547244306940128, + "language_loss": 0.84938717, + "learning_rate": 0.0007412262088623299, + "loss": 0.86040014, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.13793945, + "routerloss_mlp": 0.0, + "step": 1869, + "time_per_iteration": 2.7674195766448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0092029, + "balance_loss_mlp": 1.60128522, + "diversity_loss_mlp": 0.20662443, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.03542659619783611, + "language_loss": 0.79155517, + "learning_rate": 0.0007409532752346684, + "loss": 0.80075806, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01633519, + "step": 1870, + "time_per_iteration": 2.7116785049438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111485, + "balance_loss_mlp": 1.101367, + "diversity_loss_mlp": 0.0, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.061502004439029076, + "language_loss": 0.8836326, + "learning_rate": 0.0007406802480606491, + "loss": 0.89478111, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.13500977, + "routerloss_mlp": 0.0, + "step": 1871, + "time_per_iteration": 2.642608165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105605, + "balance_loss_mlp": 1.0916698, + "diversity_loss_mlp": 0.0, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.06939665757215846, + "language_loss": 0.90353388, + "learning_rate": 0.0007404071274462707, + "loss": 0.91458994, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.1394043, + "routerloss_mlp": 0.0, + "step": 1872, + "time_per_iteration": 2.5600955486297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09967744, + "diversity_loss_mlp": 0.0, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.07241097832053987, + "language_loss": 0.83719409, + "learning_rate": 0.0007401339134975682, + "loss": 0.84832925, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1873, + "time_per_iteration": 2.6775293350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111162, + "balance_loss_mlp": 1.09724998, + "diversity_loss_mlp": 0.0, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.07980684605652169, + "language_loss": 0.84604299, + "learning_rate": 0.0007398606063206122, + "loss": 0.85715467, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.13928223, + "routerloss_mlp": 0.0, + "step": 1874, + "time_per_iteration": 2.6092889308929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109131, + "balance_loss_mlp": 1.09546924, + "diversity_loss_mlp": 0.0, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09304103013369584, + "language_loss": 0.78818524, + "learning_rate": 0.0007395872060215101, + "loss": 0.79927647, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.13684082, + "routerloss_mlp": 0.0, + "step": 1875, + "time_per_iteration": 2.5999374389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124779, + "balance_loss_mlp": 1.11121297, + "diversity_loss_mlp": 0.0, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08049441369365674, + "language_loss": 0.8851527, + "learning_rate": 0.0007393137127064056, + "loss": 0.89640045, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.13574219, + "routerloss_mlp": 0.0, + "step": 1876, + "time_per_iteration": 2.635896682739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127435, + "balance_loss_mlp": 1.11380959, + "diversity_loss_mlp": 0.0, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.06613177233605298, + "language_loss": 0.84377646, + "learning_rate": 0.0007390401264814779, + "loss": 0.8550508, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1877, + "time_per_iteration": 2.597508192062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151319, + "balance_loss_mlp": 1.1378243, + "diversity_loss_mlp": 0.0, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.09083655630754779, + "language_loss": 0.84454513, + "learning_rate": 0.0007387664474529427, + "loss": 0.8560583, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.13525391, + "routerloss_mlp": 0.0, + "step": 1878, + "time_per_iteration": 2.6493661403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143725, + "balance_loss_mlp": 1.1302073, + "diversity_loss_mlp": 0.0, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.0643860955644754, + "language_loss": 0.91379291, + "learning_rate": 0.0007384926757270518, + "loss": 0.92523015, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.13537598, + "routerloss_mlp": 0.0, + "step": 1879, + "time_per_iteration": 2.62565016746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152012, + "balance_loss_mlp": 1.13819528, + "diversity_loss_mlp": 0.0, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07609143241795291, + "language_loss": 0.80057949, + "learning_rate": 0.0007382188114100924, + "loss": 0.81209958, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.13818359, + "routerloss_mlp": 0.0, + "step": 1880, + "time_per_iteration": 2.974212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155662, + "balance_loss_mlp": 1.14148784, + "diversity_loss_mlp": 0.0, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.0632350243804942, + "language_loss": 0.8182314, + "learning_rate": 0.0007379448546083884, + "loss": 0.82978803, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1881, + "time_per_iteration": 2.894099712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154364, + "balance_loss_mlp": 1.14052355, + "diversity_loss_mlp": 0.0, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.06232367753538678, + "language_loss": 0.8822301, + "learning_rate": 0.0007376708054282992, + "loss": 0.89377379, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.1385498, + "routerloss_mlp": 0.0, + "step": 1882, + "time_per_iteration": 2.9576163291931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162916, + "balance_loss_mlp": 1.14919519, + "diversity_loss_mlp": 0.0, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06608098206448941, + "language_loss": 0.83563071, + "learning_rate": 0.0007373966639762201, + "loss": 0.84725988, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.13757324, + "routerloss_mlp": 0.0, + "step": 1883, + "time_per_iteration": 2.6004068851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158732, + "balance_loss_mlp": 1.14478457, + "diversity_loss_mlp": 0.0, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.07441448138889938, + "language_loss": 0.88544619, + "learning_rate": 0.0007371224303585822, + "loss": 0.89703357, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.13964844, + "routerloss_mlp": 0.0, + "step": 1884, + "time_per_iteration": 2.5741078853607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109461, + "balance_loss_mlp": 1.09897089, + "diversity_loss_mlp": 0.0, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.03545085729862102, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81466532, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 1885, + "time_per_iteration": 4.706872224807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148036, + "balance_loss_mlp": 1.13442218, + "diversity_loss_mlp": 0.0, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.0691831634947964, + "language_loss": 0.8278423, + "learning_rate": 0.0007365736870525335, + "loss": 0.83932269, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1886, + "time_per_iteration": 2.8480284214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135823, + "balance_loss_mlp": 1.12236464, + "diversity_loss_mlp": 0.0, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.0786816251155578, + "language_loss": 0.82659888, + "learning_rate": 0.000736299177577164, + "loss": 0.83795714, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.13476562, + "routerloss_mlp": 0.0, + "step": 1887, + "time_per_iteration": 2.601449966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127197, + "balance_loss_mlp": 1.11358309, + "diversity_loss_mlp": 0.0, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.0767010159800114, + "language_loss": 0.8381778, + "learning_rate": 0.0007360245763623174, + "loss": 0.84944975, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.13635254, + "routerloss_mlp": 0.0, + "step": 1888, + "time_per_iteration": 2.6951138973236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106263, + "balance_loss_mlp": 1.09350717, + "diversity_loss_mlp": 0.0, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.06311908909694558, + "language_loss": 0.89886129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90992391, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1889, + "time_per_iteration": 2.8509137630462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094399, + "balance_loss_mlp": 1.08141732, + "diversity_loss_mlp": 0.0, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.06820711534899371, + "language_loss": 0.86674547, + "learning_rate": 0.0007354750991406684, + "loss": 0.87768942, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.13000488, + "routerloss_mlp": 0.0, + "step": 1890, + "time_per_iteration": 2.7162795066833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089716, + "balance_loss_mlp": 1.07673419, + "diversity_loss_mlp": 0.0, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.07876014589837055, + "language_loss": 0.80930853, + "learning_rate": 0.0007352002233471919, + "loss": 0.82020569, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.12988281, + "routerloss_mlp": 0.0, + "step": 1891, + "time_per_iteration": 2.631824016571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091374, + "balance_loss_mlp": 1.07835662, + "diversity_loss_mlp": 0.0, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.08103720744805817, + "language_loss": 0.79372823, + "learning_rate": 0.0007349252562408906, + "loss": 0.80464196, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1892, + "time_per_iteration": 2.6752734184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097706, + "balance_loss_mlp": 1.08496833, + "diversity_loss_mlp": 0.0, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.07356128462514616, + "language_loss": 0.81490725, + "learning_rate": 0.0007346501979285158, + "loss": 0.82588428, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1893, + "time_per_iteration": 2.8990893363952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_mlp": 1.03214884, + "diversity_loss_mlp": 0.0, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.022756463517582398, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81579787, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 1894, + "time_per_iteration": 4.8097145557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098768, + "balance_loss_mlp": 1.0857501, + "diversity_loss_mlp": 0.0, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.06969655176236832, + "language_loss": 0.85880721, + "learning_rate": 0.0007340998081127308, + "loss": 0.86979485, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1895, + "time_per_iteration": 2.757380485534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087705, + "balance_loss_mlp": 1.074646, + "diversity_loss_mlp": 0.0, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.06910669114263218, + "language_loss": 0.91127002, + "learning_rate": 0.0007338244768230007, + "loss": 0.92214715, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.13079834, + "routerloss_mlp": 0.0, + "step": 1896, + "time_per_iteration": 2.7967634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098629, + "balance_loss_mlp": 1.08584976, + "diversity_loss_mlp": 0.0, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.05804787602656793, + "language_loss": 0.88684666, + "learning_rate": 0.0007335490547545578, + "loss": 0.89783299, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.12780762, + "routerloss_mlp": 0.0, + "step": 1897, + "time_per_iteration": 3.086498260498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_mlp": 1.08286643, + "diversity_loss_mlp": 0.0, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06953546528053214, + "language_loss": 0.82679451, + "learning_rate": 0.0007332735420143308, + "loss": 0.83774823, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.12506104, + "routerloss_mlp": 0.0, + "step": 1898, + "time_per_iteration": 2.788245439529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097867, + "balance_loss_mlp": 1.08476591, + "diversity_loss_mlp": 0.0, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.07600656362423025, + "language_loss": 0.86647844, + "learning_rate": 0.0007329979387092826, + "loss": 0.87745708, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.13110352, + "routerloss_mlp": 0.0, + "step": 1899, + "time_per_iteration": 2.5437934398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101105, + "balance_loss_mlp": 1.08821869, + "diversity_loss_mlp": 0.0, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.05952938167480439, + "language_loss": 0.83796108, + "learning_rate": 0.0007327222449464124, + "loss": 0.8489722, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1900, + "time_per_iteration": 3.2824244499206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011078, + "balance_loss_mlp": 1.09499097, + "diversity_loss_mlp": 0.0, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07745224305421915, + "language_loss": 0.88634431, + "learning_rate": 0.0007324464608327538, + "loss": 0.89742231, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.12823486, + "routerloss_mlp": 0.0, + "step": 1901, + "time_per_iteration": 2.6411991119384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102996, + "balance_loss_mlp": 1.08995461, + "diversity_loss_mlp": 0.0, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.08223816362142805, + "language_loss": 0.88474846, + "learning_rate": 0.0007321705864753758, + "loss": 0.89577842, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.1305542, + "routerloss_mlp": 0.0, + "step": 1902, + "time_per_iteration": 2.682002544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00931657, + "balance_loss_mlp": 1.62497878, + "diversity_loss_mlp": 0.20707282, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.026825446902959647, + "language_loss": 0.84137708, + "learning_rate": 0.0007318946219813823, + "loss": 0.85069364, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01563089, + "step": 1903, + "time_per_iteration": 3.0061404705047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108165, + "balance_loss_mlp": 1.09403849, + "diversity_loss_mlp": 0.0, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.07526416733947026, + "language_loss": 0.89736164, + "learning_rate": 0.000731618567457912, + "loss": 0.90844321, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.14105225, + "routerloss_mlp": 0.0, + "step": 1904, + "time_per_iteration": 2.6523027420043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099252, + "balance_loss_mlp": 1.08536446, + "diversity_loss_mlp": 0.0, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.07605082206895837, + "language_loss": 0.87058568, + "learning_rate": 0.000731342423012139, + "loss": 0.88157821, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.13903809, + "routerloss_mlp": 0.0, + "step": 1905, + "time_per_iteration": 3.0595312118530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096318, + "balance_loss_mlp": 1.08213234, + "diversity_loss_mlp": 0.0, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07718853495225737, + "language_loss": 0.82559443, + "learning_rate": 0.0007310661887512722, + "loss": 0.83655763, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.1418457, + "routerloss_mlp": 0.0, + "step": 1906, + "time_per_iteration": 3.056859016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090478, + "balance_loss_mlp": 1.07672131, + "diversity_loss_mlp": 0.0, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.07458396044121823, + "language_loss": 0.8194133, + "learning_rate": 0.0007307898647825549, + "loss": 0.83031803, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.13769531, + "routerloss_mlp": 0.0, + "step": 1907, + "time_per_iteration": 2.670468807220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090408, + "balance_loss_mlp": 1.07666349, + "diversity_loss_mlp": 0.0, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09231339543244264, + "language_loss": 0.89368939, + "learning_rate": 0.0007305134512132659, + "loss": 0.90459347, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.13751221, + "routerloss_mlp": 0.0, + "step": 1908, + "time_per_iteration": 2.6561663150787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091843, + "balance_loss_mlp": 1.07826495, + "diversity_loss_mlp": 0.0, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.08913139219920335, + "language_loss": 0.83308864, + "learning_rate": 0.0007302369481507183, + "loss": 0.84400707, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.13592529, + "routerloss_mlp": 0.0, + "step": 1909, + "time_per_iteration": 2.5485799312591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017138, + "balance_loss_mlp": 1.00979447, + "diversity_loss_mlp": 0.0, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.013277678950868657, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.80978894, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.07324219, + "routerloss_mlp": 0.0, + "step": 1910, + "time_per_iteration": 4.848855257034302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111697, + "balance_loss_mlp": 1.09842944, + "diversity_loss_mlp": 0.0, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.058739485749840115, + "language_loss": 0.85315347, + "learning_rate": 0.000729683673975274, + "loss": 0.86427045, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.13287354, + "routerloss_mlp": 0.0, + "step": 1911, + "time_per_iteration": 2.690218210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114863, + "balance_loss_mlp": 1.10165429, + "diversity_loss_mlp": 0.0, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05585809452393386, + "language_loss": 0.8291769, + "learning_rate": 0.0007294069030771774, + "loss": 0.84032547, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.13232422, + "routerloss_mlp": 0.0, + "step": 1912, + "time_per_iteration": 3.678927183151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125561, + "balance_loss_mlp": 1.1124301, + "diversity_loss_mlp": 0.0, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.06389765233013874, + "language_loss": 0.90667701, + "learning_rate": 0.0007291300431154224, + "loss": 0.91793263, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1913, + "time_per_iteration": 2.616999387741089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043841, + "balance_loss_mlp": 1.03611672, + "diversity_loss_mlp": 0.0, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.02051984405011318, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.7143358, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.07714844, + "routerloss_mlp": 0.0, + "step": 1914, + "time_per_iteration": 4.973980903625488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137116, + "balance_loss_mlp": 1.12441444, + "diversity_loss_mlp": 0.0, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.0814243559806059, + "language_loss": 0.7981922, + "learning_rate": 0.0007285760564309179, + "loss": 0.8095634, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.1270752, + "routerloss_mlp": 0.0, + "step": 1915, + "time_per_iteration": 3.091447353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127931, + "balance_loss_mlp": 1.11485386, + "diversity_loss_mlp": 0.0, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.09574055809111115, + "language_loss": 0.84848046, + "learning_rate": 0.0007282989299232448, + "loss": 0.85975981, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.13092041, + "routerloss_mlp": 0.0, + "step": 1916, + "time_per_iteration": 3.074547052383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113017, + "balance_loss_mlp": 1.09977341, + "diversity_loss_mlp": 0.0, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.08763204320127825, + "language_loss": 0.83209801, + "learning_rate": 0.0007280217147820668, + "loss": 0.84322822, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.13256836, + "routerloss_mlp": 0.0, + "step": 1917, + "time_per_iteration": 2.6260228157043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092055, + "balance_loss_mlp": 1.07888198, + "diversity_loss_mlp": 0.0, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06316346716689762, + "language_loss": 0.79465461, + "learning_rate": 0.0007277444111150079, + "loss": 0.80557513, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.13189697, + "routerloss_mlp": 0.0, + "step": 1918, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088544, + "balance_loss_mlp": 1.07465601, + "diversity_loss_mlp": 0.0, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.09595367080188737, + "language_loss": 0.84512901, + "learning_rate": 0.0007274670190297272, + "loss": 0.85601443, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.13891602, + "routerloss_mlp": 0.0, + "step": 1919, + "time_per_iteration": 2.590839147567749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085845, + "balance_loss_mlp": 1.07205224, + "diversity_loss_mlp": 0.0, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.07431087712553297, + "language_loss": 0.82079387, + "learning_rate": 0.0007271895386339179, + "loss": 0.83165228, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.13806152, + "routerloss_mlp": 0.0, + "step": 1920, + "time_per_iteration": 2.7924282550811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094639, + "balance_loss_mlp": 1.08048892, + "diversity_loss_mlp": 0.0, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.07797312778631413, + "language_loss": 0.83431751, + "learning_rate": 0.0007269119700353073, + "loss": 0.84526384, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.14160156, + "routerloss_mlp": 0.0, + "step": 1921, + "time_per_iteration": 2.7155139446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_mlp": 1.0987196, + "diversity_loss_mlp": 0.0, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.07250682713227712, + "language_loss": 0.84994757, + "learning_rate": 0.0007266343133416571, + "loss": 0.86107111, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.13647461, + "routerloss_mlp": 0.0, + "step": 1922, + "time_per_iteration": 2.7394983768463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073276, + "balance_loss_mlp": 1.06564641, + "diversity_loss_mlp": 0.0, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.035523530201468645, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78190196, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.07617188, + "routerloss_mlp": 0.0, + "step": 1923, + "time_per_iteration": 4.877161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115366, + "balance_loss_mlp": 1.10153794, + "diversity_loss_mlp": 0.0, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.0789330271899564, + "language_loss": 0.84356588, + "learning_rate": 0.0007260787361004556, + "loss": 0.85471952, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.13830566, + "routerloss_mlp": 0.0, + "step": 1924, + "time_per_iteration": 2.608745813369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_mlp": 1.02985299, + "diversity_loss_mlp": 0.0, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.021371165562314075, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74798417, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 1925, + "time_per_iteration": 4.906585931777954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114233, + "balance_loss_mlp": 1.10069048, + "diversity_loss_mlp": 0.0, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.12026638393290963, + "language_loss": 0.87422252, + "learning_rate": 0.0007255228077730903, + "loss": 0.88536477, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.13549805, + "routerloss_mlp": 0.0, + "step": 1926, + "time_per_iteration": 2.6886680126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123836, + "balance_loss_mlp": 1.11107421, + "diversity_loss_mlp": 0.0, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06719853297068734, + "language_loss": 0.81722987, + "learning_rate": 0.0007252447122218632, + "loss": 0.82846814, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.12768555, + "routerloss_mlp": 0.0, + "step": 1927, + "time_per_iteration": 3.1511058807373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125512, + "balance_loss_mlp": 1.11258984, + "diversity_loss_mlp": 0.0, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.08764579691953547, + "language_loss": 0.87849444, + "learning_rate": 0.0007249665292228834, + "loss": 0.88974959, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1928, + "time_per_iteration": 2.565991163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120289, + "balance_loss_mlp": 1.1073308, + "diversity_loss_mlp": 0.0, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.0633685198143462, + "language_loss": 0.83318496, + "learning_rate": 0.000724688258884151, + "loss": 0.84438789, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.12963867, + "routerloss_mlp": 0.0, + "step": 1929, + "time_per_iteration": 2.531827926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.10286927, + "diversity_loss_mlp": 0.0, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.05744658583323744, + "language_loss": 0.86564112, + "learning_rate": 0.0007244099013137002, + "loss": 0.8767941, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 1930, + "time_per_iteration": 3.1130166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.10404849, + "diversity_loss_mlp": 0.0, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.06880018611034966, + "language_loss": 0.88695574, + "learning_rate": 0.0007241314566195993, + "loss": 0.89812243, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.1262207, + "routerloss_mlp": 0.0, + "step": 1931, + "time_per_iteration": 3.374743700027466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110097, + "balance_loss_mlp": 1.08821416, + "diversity_loss_mlp": 0.0, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.06303779661636588, + "language_loss": 0.85510373, + "learning_rate": 0.0007238529249099496, + "loss": 0.86611342, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.12750244, + "routerloss_mlp": 0.0, + "step": 1932, + "time_per_iteration": 2.6654059886932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097731, + "balance_loss_mlp": 1.0911988, + "diversity_loss_mlp": 0.0, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03412398452916775, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78954613, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 1933, + "time_per_iteration": 4.851354598999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091175, + "balance_loss_mlp": 1.07859278, + "diversity_loss_mlp": 0.0, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.08014253307267598, + "language_loss": 0.80636895, + "learning_rate": 0.000723295600876581, + "loss": 0.81728071, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 1934, + "time_per_iteration": 3.0025534629821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097044, + "balance_loss_mlp": 1.08416963, + "diversity_loss_mlp": 0.0, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.08698689907724866, + "language_loss": 0.88006312, + "learning_rate": 0.0007230168087692344, + "loss": 0.89103359, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.12872314, + "routerloss_mlp": 0.0, + "step": 1935, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095813, + "balance_loss_mlp": 1.0830214, + "diversity_loss_mlp": 0.0, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07031074193849007, + "language_loss": 0.82382512, + "learning_rate": 0.0007227379300790839, + "loss": 0.8347832, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1936, + "time_per_iteration": 3.0040676593780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092601, + "balance_loss_mlp": 1.07969058, + "diversity_loss_mlp": 0.0, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.07132774808829288, + "language_loss": 0.85478282, + "learning_rate": 0.0007224589649143997, + "loss": 0.86570889, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 1937, + "time_per_iteration": 2.584545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089825, + "balance_loss_mlp": 1.07662272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.0711139803163438, + "language_loss": 0.8120302, + "learning_rate": 0.0007221799133834861, + "loss": 0.82292843, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.13214111, + "routerloss_mlp": 0.0, + "step": 1938, + "time_per_iteration": 2.6393649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109955, + "balance_loss_mlp": 1.08649623, + "diversity_loss_mlp": 0.0, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20460237815205612, + "language_loss": 0.81793052, + "learning_rate": 0.00072190077559468, + "loss": 0.82892597, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1939, + "time_per_iteration": 2.5494682788848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127139, + "balance_loss_mlp": 1.1140976, + "diversity_loss_mlp": 0.0, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.05817015695703163, + "language_loss": 0.89248812, + "learning_rate": 0.0007216215516563527, + "loss": 0.90375948, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.13049316, + "routerloss_mlp": 0.0, + "step": 1940, + "time_per_iteration": 2.6755452156066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129035, + "balance_loss_mlp": 1.1159811, + "diversity_loss_mlp": 0.0, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.07778932214282369, + "language_loss": 0.83852386, + "learning_rate": 0.0007213422416769083, + "loss": 0.84981418, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 1941, + "time_per_iteration": 2.6008002758026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135994, + "balance_loss_mlp": 1.12319708, + "diversity_loss_mlp": 0.0, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.06345716224902766, + "language_loss": 0.7501297, + "learning_rate": 0.0007210628457647849, + "loss": 0.76148963, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1942, + "time_per_iteration": 2.5911362171173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140859, + "balance_loss_mlp": 1.12763917, + "diversity_loss_mlp": 0.0, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.06753886702103719, + "language_loss": 0.78585184, + "learning_rate": 0.000720783364028453, + "loss": 0.7972604, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.13238525, + "routerloss_mlp": 0.0, + "step": 1943, + "time_per_iteration": 2.7490458488464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149977, + "balance_loss_mlp": 1.13685822, + "diversity_loss_mlp": 0.0, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.0650742437261564, + "language_loss": 0.87667847, + "learning_rate": 0.0007205037965764177, + "loss": 0.88817823, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.13140869, + "routerloss_mlp": 0.0, + "step": 1944, + "time_per_iteration": 2.5870554447174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134812, + "balance_loss_mlp": 1.12192512, + "diversity_loss_mlp": 0.0, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07468357539719116, + "language_loss": 0.85650361, + "learning_rate": 0.0007202241435172161, + "loss": 0.86785173, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.12902832, + "routerloss_mlp": 0.0, + "step": 1945, + "time_per_iteration": 2.7550253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131811, + "balance_loss_mlp": 1.11901414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.07270487210957549, + "language_loss": 0.87884831, + "learning_rate": 0.0007199444049594198, + "loss": 0.8901664, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.12805176, + "routerloss_mlp": 0.0, + "step": 1946, + "time_per_iteration": 2.9499337673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111095, + "balance_loss_mlp": 1.09783912, + "diversity_loss_mlp": 0.0, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.07247382516020226, + "language_loss": 0.83384776, + "learning_rate": 0.0007196645810116322, + "loss": 0.84495866, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.13269043, + "routerloss_mlp": 0.0, + "step": 1947, + "time_per_iteration": 2.70394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113218, + "balance_loss_mlp": 1.1003499, + "diversity_loss_mlp": 0.0, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07522309633784076, + "language_loss": 0.84431696, + "learning_rate": 0.0007193846717824912, + "loss": 0.8554492, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.12884521, + "routerloss_mlp": 0.0, + "step": 1948, + "time_per_iteration": 2.923752546310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116002, + "balance_loss_mlp": 1.10312748, + "diversity_loss_mlp": 0.0, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.06883561802065806, + "language_loss": 0.88268626, + "learning_rate": 0.0007191046773806669, + "loss": 0.89384627, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1949, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108921, + "balance_loss_mlp": 1.09593272, + "diversity_loss_mlp": 0.0, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.07969110082801287, + "language_loss": 0.83211446, + "learning_rate": 0.0007188245979148631, + "loss": 0.84320366, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.13006592, + "routerloss_mlp": 0.0, + "step": 1950, + "time_per_iteration": 3.193124294281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111725, + "balance_loss_mlp": 1.09892154, + "diversity_loss_mlp": 0.0, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.07005872092850987, + "language_loss": 0.87434363, + "learning_rate": 0.0007185444334938157, + "loss": 0.88546085, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.12811279, + "routerloss_mlp": 0.0, + "step": 1951, + "time_per_iteration": 2.669201135635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101783, + "balance_loss_mlp": 1.0892663, + "diversity_loss_mlp": 0.0, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.08195801919923047, + "language_loss": 0.85047525, + "learning_rate": 0.0007182641842262947, + "loss": 0.86149311, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 1952, + "time_per_iteration": 2.602139472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092197, + "balance_loss_mlp": 1.07936394, + "diversity_loss_mlp": 0.0, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.07349771430020792, + "language_loss": 0.77754879, + "learning_rate": 0.0007179838502211022, + "loss": 0.78847075, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.128479, + "routerloss_mlp": 0.0, + "step": 1953, + "time_per_iteration": 2.85720157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094498, + "balance_loss_mlp": 1.08148086, + "diversity_loss_mlp": 0.0, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0681681729591206, + "language_loss": 0.86330736, + "learning_rate": 0.0007177034315870738, + "loss": 0.87425238, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.13024902, + "routerloss_mlp": 0.0, + "step": 1954, + "time_per_iteration": 2.958862066268921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101066, + "balance_loss_mlp": 1.08803654, + "diversity_loss_mlp": 0.0, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.06642365438263753, + "language_loss": 0.90809441, + "learning_rate": 0.0007174229284330773, + "loss": 0.91910505, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.13037109, + "routerloss_mlp": 0.0, + "step": 1955, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_mlp": 1.07642531, + "diversity_loss_mlp": 0.0, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.07788827503332588, + "language_loss": 0.86705017, + "learning_rate": 0.0007171423408680141, + "loss": 0.87794375, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.12939453, + "routerloss_mlp": 0.0, + "step": 1956, + "time_per_iteration": 2.8101606369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00950311, + "balance_loss_mlp": 1.6602329, + "diversity_loss_mlp": 0.20739825, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.03218717292019043, + "language_loss": 0.89567441, + "learning_rate": 0.0007168616690008176, + "loss": 0.90517747, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01649548, + "step": 1957, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081569, + "balance_loss_mlp": 1.06840825, + "diversity_loss_mlp": 0.0, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.07242251254882147, + "language_loss": 0.85681045, + "learning_rate": 0.0007165809129404545, + "loss": 0.86762613, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.13171387, + "routerloss_mlp": 0.0, + "step": 1958, + "time_per_iteration": 2.8396048545837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090105, + "balance_loss_mlp": 1.07657433, + "diversity_loss_mlp": 0.0, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08227545286248691, + "language_loss": 0.86212921, + "learning_rate": 0.0007163000727959239, + "loss": 0.87303019, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.13562012, + "routerloss_mlp": 0.0, + "step": 1959, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087148, + "balance_loss_mlp": 1.07989979, + "diversity_loss_mlp": 0.0, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.05215322395932221, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79046214, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 1960, + "time_per_iteration": 4.869986057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095327, + "balance_loss_mlp": 1.08232689, + "diversity_loss_mlp": 0.0, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.08048811275026858, + "language_loss": 0.84568793, + "learning_rate": 0.00071573814069052, + "loss": 0.85664117, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.13018799, + "routerloss_mlp": 0.0, + "step": 1961, + "time_per_iteration": 2.9122819900512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109901, + "balance_loss_mlp": 1.08614171, + "diversity_loss_mlp": 0.0, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.06061063893945359, + "language_loss": 0.88073885, + "learning_rate": 0.0007154570489478081, + "loss": 0.89172894, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.12878418, + "routerloss_mlp": 0.0, + "step": 1962, + "time_per_iteration": 3.1824018955230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111399, + "balance_loss_mlp": 1.10154414, + "diversity_loss_mlp": 0.0, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.06274200702745775, + "language_loss": 0.86391222, + "learning_rate": 0.0007151758735572514, + "loss": 0.87505209, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 1963, + "time_per_iteration": 2.997624158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111089, + "balance_loss_mlp": 1.09836888, + "diversity_loss_mlp": 0.0, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.07983075782925624, + "language_loss": 0.80894458, + "learning_rate": 0.0007148946146280119, + "loss": 0.82005548, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 1964, + "time_per_iteration": 2.836583137512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00620122, + "balance_loss_mlp": 1.05382681, + "diversity_loss_mlp": 0.16216688, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.0017779517528101797, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.72812271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01212509, + "step": 1965, + "time_per_iteration": 4.906678915023804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_mlp": 1.02436352, + "diversity_loss_mlp": 0.0, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.025755206304302582, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.7637251, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 1966, + "time_per_iteration": 4.93319296836853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127949, + "balance_loss_mlp": 1.11581361, + "diversity_loss_mlp": 0.0, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.05898800907157556, + "language_loss": 0.83873129, + "learning_rate": 0.0007140503377003022, + "loss": 0.85001081, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 1967, + "time_per_iteration": 2.9807000160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123739, + "balance_loss_mlp": 1.11125755, + "diversity_loss_mlp": 0.0, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.06421364750503517, + "language_loss": 0.84625173, + "learning_rate": 0.000713768745708599, + "loss": 0.85748911, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 1968, + "time_per_iteration": 2.610745429992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118961, + "balance_loss_mlp": 1.10671234, + "diversity_loss_mlp": 0.0, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.06880095080762995, + "language_loss": 0.77052647, + "learning_rate": 0.0007134870707245085, + "loss": 0.78171611, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.12249756, + "routerloss_mlp": 0.0, + "step": 1969, + "time_per_iteration": 3.302985429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120912, + "balance_loss_mlp": 1.10852587, + "diversity_loss_mlp": 0.0, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.07142024228833302, + "language_loss": 0.84469545, + "learning_rate": 0.0007132053128573864, + "loss": 0.85590458, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.12384033, + "routerloss_mlp": 0.0, + "step": 1970, + "time_per_iteration": 2.7751197814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124437, + "balance_loss_mlp": 1.11231327, + "diversity_loss_mlp": 0.0, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.06795721743578591, + "language_loss": 0.83786452, + "learning_rate": 0.0007129234722166211, + "loss": 0.84910882, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 1971, + "time_per_iteration": 2.806898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114684, + "balance_loss_mlp": 1.10238707, + "diversity_loss_mlp": 0.0, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.06601167392952549, + "language_loss": 0.91087604, + "learning_rate": 0.0007126415489116328, + "loss": 0.92202282, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.1229248, + "routerloss_mlp": 0.0, + "step": 1972, + "time_per_iteration": 2.656651496887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.09782279, + "diversity_loss_mlp": 0.0, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.06641244535011205, + "language_loss": 0.81145501, + "learning_rate": 0.0007123595430518736, + "loss": 0.82255375, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 1973, + "time_per_iteration": 2.8665072917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102568, + "balance_loss_mlp": 1.09068835, + "diversity_loss_mlp": 0.0, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.07235703206146665, + "language_loss": 0.86411089, + "learning_rate": 0.0007120774547468282, + "loss": 0.87513655, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 1974, + "time_per_iteration": 2.5590381622314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00948323, + "balance_loss_mlp": 1.65707994, + "diversity_loss_mlp": 0.20756721, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.03148003592885531, + "language_loss": 0.81558585, + "learning_rate": 0.0007117952841060128, + "loss": 0.82506907, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01599924, + "step": 1975, + "time_per_iteration": 2.6777563095092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083824, + "balance_loss_mlp": 1.07167053, + "diversity_loss_mlp": 0.0, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.07660828670939425, + "language_loss": 0.83672053, + "learning_rate": 0.0007115130312389756, + "loss": 0.8475588, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 1976, + "time_per_iteration": 2.7103323936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07200503, + "diversity_loss_mlp": 0.0, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.08353002189035653, + "language_loss": 0.79290646, + "learning_rate": 0.0007112306962552973, + "loss": 0.80375111, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.12463379, + "routerloss_mlp": 0.0, + "step": 1977, + "time_per_iteration": 2.576239824295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084774, + "balance_loss_mlp": 1.07254314, + "diversity_loss_mlp": 0.0, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.06483406604645132, + "language_loss": 0.85315859, + "learning_rate": 0.0007109482792645896, + "loss": 0.86400628, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 1978, + "time_per_iteration": 2.7146143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084927, + "balance_loss_mlp": 1.07276165, + "diversity_loss_mlp": 0.0, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.06865418790878511, + "language_loss": 0.83831733, + "learning_rate": 0.0007106657803764969, + "loss": 0.84916663, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 1979, + "time_per_iteration": 2.73152494430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086146, + "balance_loss_mlp": 1.07395101, + "diversity_loss_mlp": 0.0, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07620298141647525, + "language_loss": 0.81962979, + "learning_rate": 0.0007103831997006948, + "loss": 0.83049119, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 1980, + "time_per_iteration": 2.7383615970611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094611, + "balance_loss_mlp": 1.08276772, + "diversity_loss_mlp": 0.0, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.0842263164190672, + "language_loss": 0.85342598, + "learning_rate": 0.0007101005373468908, + "loss": 0.86437213, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 1981, + "time_per_iteration": 2.889251708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097444, + "balance_loss_mlp": 1.08543372, + "diversity_loss_mlp": 0.0, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.06048237516575629, + "language_loss": 0.86649287, + "learning_rate": 0.0007098177934248242, + "loss": 0.87746727, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 1982, + "time_per_iteration": 2.773146867752075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920145, + "balance_loss_mlp": 1.60273147, + "diversity_loss_mlp": 0.20649332, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.033525346661278974, + "language_loss": 0.85516387, + "learning_rate": 0.0007095349680442661, + "loss": 0.86436534, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01553278, + "step": 1983, + "time_per_iteration": 2.8675785064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116298, + "balance_loss_mlp": 1.1045742, + "diversity_loss_mlp": 0.0, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.06407324010727367, + "language_loss": 0.78783178, + "learning_rate": 0.0007092520613150188, + "loss": 0.79899484, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 1984, + "time_per_iteration": 2.709177017211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00918651, + "balance_loss_mlp": 1.59999418, + "diversity_loss_mlp": 0.20665541, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.03070680845617011, + "language_loss": 0.80925471, + "learning_rate": 0.0007089690733469165, + "loss": 0.81844121, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532666, + "step": 1985, + "time_per_iteration": 2.750558376312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135734, + "balance_loss_mlp": 1.12384343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.08571071539105668, + "language_loss": 0.82313848, + "learning_rate": 0.000708686004249825, + "loss": 0.83449578, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 1986, + "time_per_iteration": 2.7550368309020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.12012124, + "diversity_loss_mlp": 0.0, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07744479108461458, + "language_loss": 0.91340905, + "learning_rate": 0.0007084028541336413, + "loss": 0.92473006, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 1987, + "time_per_iteration": 2.703339099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00914957, + "balance_loss_mlp": 1.59260678, + "diversity_loss_mlp": 0.20690078, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.03035395776464378, + "language_loss": 0.86267084, + "learning_rate": 0.0007081196231082942, + "loss": 0.87182039, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01520337, + "step": 1988, + "time_per_iteration": 2.8075153827667236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.10567343, + "diversity_loss_mlp": 0.0, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.07746710731409655, + "language_loss": 0.80053389, + "learning_rate": 0.0007078363112837436, + "loss": 0.81171107, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.12036133, + "routerloss_mlp": 0.0, + "step": 1989, + "time_per_iteration": 2.811197280883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.09261441, + "diversity_loss_mlp": 0.0, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.07961201652041947, + "language_loss": 0.84721339, + "learning_rate": 0.000707552918769981, + "loss": 0.85826218, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 1990, + "time_per_iteration": 2.4908246994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102216, + "balance_loss_mlp": 1.08987188, + "diversity_loss_mlp": 0.0, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.06284554422997896, + "language_loss": 0.83619118, + "learning_rate": 0.000707269445677029, + "loss": 0.84721333, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.12341309, + "routerloss_mlp": 0.0, + "step": 1991, + "time_per_iteration": 2.733126401901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101588, + "balance_loss_mlp": 1.08921361, + "diversity_loss_mlp": 0.0, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07203164936975576, + "language_loss": 0.85140717, + "learning_rate": 0.0007069858921149416, + "loss": 0.86242306, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 1992, + "time_per_iteration": 2.9382007122039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096798, + "balance_loss_mlp": 1.08434701, + "diversity_loss_mlp": 0.0, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.05485930037569587, + "language_loss": 0.85794246, + "learning_rate": 0.0007067022581938043, + "loss": 0.86891043, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.12457275, + "routerloss_mlp": 0.0, + "step": 1993, + "time_per_iteration": 2.857525110244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095406, + "balance_loss_mlp": 1.08321714, + "diversity_loss_mlp": 0.0, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.0871408980162776, + "language_loss": 0.83722532, + "learning_rate": 0.0007064185440237334, + "loss": 0.8481794, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1994, + "time_per_iteration": 2.7131123542785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099231, + "balance_loss_mlp": 1.08733368, + "diversity_loss_mlp": 0.0, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.06357294591464056, + "language_loss": 0.84358412, + "learning_rate": 0.0007061347497148764, + "loss": 0.85457647, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.11895752, + "routerloss_mlp": 0.0, + "step": 1995, + "time_per_iteration": 2.7398569583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102339, + "balance_loss_mlp": 1.09015, + "diversity_loss_mlp": 0.0, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.07322887134464046, + "language_loss": 0.86299884, + "learning_rate": 0.0007058508753774122, + "loss": 0.87402225, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.12188721, + "routerloss_mlp": 0.0, + "step": 1996, + "time_per_iteration": 2.6903162002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108867, + "balance_loss_mlp": 1.09709477, + "diversity_loss_mlp": 0.0, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.0698381422429368, + "language_loss": 0.86921895, + "learning_rate": 0.0007055669211215505, + "loss": 0.88030767, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 1997, + "time_per_iteration": 2.695028066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113141, + "balance_loss_mlp": 1.10084486, + "diversity_loss_mlp": 0.0, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.08585182349688475, + "language_loss": 0.77776283, + "learning_rate": 0.0007052828870575322, + "loss": 0.78889418, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 1998, + "time_per_iteration": 2.685685873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011259, + "balance_loss_mlp": 1.11406291, + "diversity_loss_mlp": 0.0, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06979871165732322, + "language_loss": 0.87060714, + "learning_rate": 0.0007049987732956291, + "loss": 0.8818661, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.11834717, + "routerloss_mlp": 0.0, + "step": 1999, + "time_per_iteration": 2.9710631370544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110647, + "balance_loss_mlp": 1.09428668, + "diversity_loss_mlp": 0.0, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05561177596637214, + "language_loss": 0.82812738, + "learning_rate": 0.0007047145799461439, + "loss": 0.83919203, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2000, + "time_per_iteration": 2.8492860794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105216, + "balance_loss_mlp": 1.09293747, + "diversity_loss_mlp": 0.0, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.06017266002852966, + "language_loss": 0.82272708, + "learning_rate": 0.00070443030711941, + "loss": 0.83377922, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2001, + "time_per_iteration": 2.769383430480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100076, + "balance_loss_mlp": 1.08806002, + "diversity_loss_mlp": 0.0, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.061888534691205976, + "language_loss": 0.82098496, + "learning_rate": 0.0007041459549257924, + "loss": 0.83198571, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2002, + "time_per_iteration": 2.876244306564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089803, + "balance_loss_mlp": 1.07744145, + "diversity_loss_mlp": 0.0, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.06816771124006925, + "language_loss": 0.78024125, + "learning_rate": 0.0007038615234756859, + "loss": 0.79113925, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.12359619, + "routerloss_mlp": 0.0, + "step": 2003, + "time_per_iteration": 3.1744768619537354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086899, + "balance_loss_mlp": 1.07477546, + "diversity_loss_mlp": 0.0, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.09233530116269285, + "language_loss": 0.83808231, + "learning_rate": 0.000703577012879517, + "loss": 0.84895122, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2004, + "time_per_iteration": 2.633391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089004, + "balance_loss_mlp": 1.07705307, + "diversity_loss_mlp": 0.0, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07105955558417659, + "language_loss": 0.88946962, + "learning_rate": 0.0007032924232477423, + "loss": 0.90035963, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2005, + "time_per_iteration": 2.6482574939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109168, + "balance_loss_mlp": 1.0797528, + "diversity_loss_mlp": 0.0, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.07024694433071269, + "language_loss": 0.80605727, + "learning_rate": 0.0007030077546908493, + "loss": 0.81697416, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2006, + "time_per_iteration": 2.6219046115875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.08056581, + "diversity_loss_mlp": 0.0, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.032453276732354666, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84151709, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.06494141, + "routerloss_mlp": 0.0, + "step": 2007, + "time_per_iteration": 4.798014402389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099136, + "balance_loss_mlp": 1.08744717, + "diversity_loss_mlp": 0.0, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.08661380313869275, + "language_loss": 0.79137146, + "learning_rate": 0.0007024381812438117, + "loss": 0.8023628, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2008, + "time_per_iteration": 2.5403189659118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_mlp": 1.08864713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09407170185597404, + "language_loss": 0.83448064, + "learning_rate": 0.0007021532765747951, + "loss": 0.8454901, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2009, + "time_per_iteration": 2.9585187435150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094144, + "balance_loss_mlp": 1.08211613, + "diversity_loss_mlp": 0.0, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.0684890586406507, + "language_loss": 0.79048979, + "learning_rate": 0.0007018682934229162, + "loss": 0.80143124, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2010, + "time_per_iteration": 2.9703307151794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_mlp": 1.0842756, + "diversity_loss_mlp": 0.0, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06303649013837292, + "language_loss": 0.82761061, + "learning_rate": 0.0007015832318988152, + "loss": 0.83857542, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2011, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_mlp": 1.02231336, + "diversity_loss_mlp": 0.0, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.017766506591404385, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.7491802, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 2012, + "time_per_iteration": 4.938155651092529 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_mlp": 1.07810068, + "diversity_loss_mlp": 0.0, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.060967443696148906, + "language_loss": 0.84265292, + "learning_rate": 0.0007010128741766604, + "loss": 0.85356176, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.12792969, + "routerloss_mlp": 0.0, + "step": 2013, + "time_per_iteration": 2.7293431758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091499, + "balance_loss_mlp": 1.07861209, + "diversity_loss_mlp": 0.0, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.07873148114105366, + "language_loss": 0.84277219, + "learning_rate": 0.0007007275782000391, + "loss": 0.85368717, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.12896729, + "routerloss_mlp": 0.0, + "step": 2014, + "time_per_iteration": 2.644911766052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091625, + "balance_loss_mlp": 1.07889354, + "diversity_loss_mlp": 0.0, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.0868083489465314, + "language_loss": 0.8502394, + "learning_rate": 0.0007004422042940605, + "loss": 0.86115563, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2015, + "time_per_iteration": 2.5096747875213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109894, + "balance_loss_mlp": 1.08593392, + "diversity_loss_mlp": 0.0, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08227522563153689, + "language_loss": 0.89877218, + "learning_rate": 0.0007001567525695169, + "loss": 0.90976155, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.13012695, + "routerloss_mlp": 0.0, + "step": 2016, + "time_per_iteration": 2.606520891189575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105972, + "balance_loss_mlp": 1.09330583, + "diversity_loss_mlp": 0.0, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06437704205290017, + "language_loss": 0.83705699, + "learning_rate": 0.0006998712231372303, + "loss": 0.84811676, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2017, + "time_per_iteration": 3.016061305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119415, + "balance_loss_mlp": 1.10692167, + "diversity_loss_mlp": 0.0, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06622760195410109, + "language_loss": 0.85886908, + "learning_rate": 0.0006995856161080532, + "loss": 0.87006325, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.12487793, + "routerloss_mlp": 0.0, + "step": 2018, + "time_per_iteration": 2.8263893127441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_mlp": 1.11165869, + "diversity_loss_mlp": 0.0, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.06957079313074316, + "language_loss": 0.82328916, + "learning_rate": 0.0006992999315928679, + "loss": 0.83453172, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.1260376, + "routerloss_mlp": 0.0, + "step": 2019, + "time_per_iteration": 2.789020299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.11772799, + "diversity_loss_mlp": 0.0, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.05589846380959986, + "language_loss": 0.85480869, + "learning_rate": 0.0006990141697025871, + "loss": 0.86611497, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.12915039, + "routerloss_mlp": 0.0, + "step": 2020, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067569, + "balance_loss_mlp": 1.06141829, + "diversity_loss_mlp": 0.0, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.034323999481440985, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77427208, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2021, + "time_per_iteration": 4.782108545303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130924, + "balance_loss_mlp": 1.11879468, + "diversity_loss_mlp": 0.0, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0813348018947899, + "language_loss": 0.82333553, + "learning_rate": 0.0006984424142405392, + "loss": 0.83464473, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2022, + "time_per_iteration": 2.804866075515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.10578668, + "diversity_loss_mlp": 0.0, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.07379903296161248, + "language_loss": 0.82117045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83235097, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2023, + "time_per_iteration": 2.5883662700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130828, + "balance_loss_mlp": 1.11855519, + "diversity_loss_mlp": 0.0, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.07869766022149485, + "language_loss": 0.8995713, + "learning_rate": 0.0006978703506098102, + "loss": 0.91087961, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2024, + "time_per_iteration": 2.730283498764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127449, + "balance_loss_mlp": 1.11556411, + "diversity_loss_mlp": 0.0, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.0665173530375796, + "language_loss": 0.88210815, + "learning_rate": 0.00069758420350879, + "loss": 0.89338267, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2025, + "time_per_iteration": 2.62969708442688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932402, + "balance_loss_mlp": 1.62686133, + "diversity_loss_mlp": 0.20693868, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.03379762859523427, + "language_loss": 0.8613863, + "learning_rate": 0.000697297979698779, + "loss": 0.87071025, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01550185, + "step": 2026, + "time_per_iteration": 2.837543249130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107077, + "balance_loss_mlp": 1.09529877, + "diversity_loss_mlp": 0.0, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06049708379655892, + "language_loss": 0.83660531, + "learning_rate": 0.0006970116792908992, + "loss": 0.84767604, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2027, + "time_per_iteration": 3.1133604049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107629, + "balance_loss_mlp": 1.0960542, + "diversity_loss_mlp": 0.0, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.07190738956644391, + "language_loss": 0.81380564, + "learning_rate": 0.000696725302396302, + "loss": 0.82488191, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2028, + "time_per_iteration": 2.6460230350494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109766, + "balance_loss_mlp": 1.08604932, + "diversity_loss_mlp": 0.0, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.06814290150602269, + "language_loss": 0.85887402, + "learning_rate": 0.0006964388491261692, + "loss": 0.86985064, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2029, + "time_per_iteration": 3.296208143234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099195, + "balance_loss_mlp": 1.0871129, + "diversity_loss_mlp": 0.0, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.075812953715104, + "language_loss": 0.87511015, + "learning_rate": 0.0006961523195917114, + "loss": 0.88610214, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2030, + "time_per_iteration": 2.803239345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107522, + "balance_loss_mlp": 1.09573865, + "diversity_loss_mlp": 0.0, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.0665807006884719, + "language_loss": 0.78137511, + "learning_rate": 0.0006958657139041696, + "loss": 0.79245031, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2031, + "time_per_iteration": 2.739151954650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061018, + "balance_loss_mlp": 1.05531955, + "diversity_loss_mlp": 0.0, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.035996309550900246, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77773988, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.05688477, + "routerloss_mlp": 0.0, + "step": 2032, + "time_per_iteration": 4.918209552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094882, + "balance_loss_mlp": 1.08307993, + "diversity_loss_mlp": 0.0, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.0751880944680772, + "language_loss": 0.78643966, + "learning_rate": 0.0006952922745149434, + "loss": 0.79738843, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2033, + "time_per_iteration": 2.6274161338806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091358, + "balance_loss_mlp": 1.07940745, + "diversity_loss_mlp": 0.0, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.07391479182011068, + "language_loss": 0.87674987, + "learning_rate": 0.000695005441035888, + "loss": 0.88766348, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2034, + "time_per_iteration": 2.647348642349243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018577, + "balance_loss_mlp": 1.01280713, + "diversity_loss_mlp": 0.0, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.010435626825017296, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74742007, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2035, + "time_per_iteration": 4.8861188888549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107172, + "balance_loss_mlp": 1.094733, + "diversity_loss_mlp": 0.0, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06114898183694146, + "language_loss": 0.81133932, + "learning_rate": 0.0006944315470656863, + "loss": 0.82241106, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.12438965, + "routerloss_mlp": 0.0, + "step": 2036, + "time_per_iteration": 3.0057246685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108633, + "balance_loss_mlp": 1.09606266, + "diversity_loss_mlp": 0.0, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.0812142536963638, + "language_loss": 0.90953541, + "learning_rate": 0.000694144486797345, + "loss": 0.92062169, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2037, + "time_per_iteration": 2.6566872596740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_mlp": 1.0060699, + "diversity_loss_mlp": 0.0, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.012879447335335118, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80532491, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2038, + "time_per_iteration": 4.609802722930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103729, + "balance_loss_mlp": 1.09141517, + "diversity_loss_mlp": 0.0, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.07718413790316761, + "language_loss": 0.89271998, + "learning_rate": 0.0006935701402514156, + "loss": 0.90375727, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2039, + "time_per_iteration": 2.610905408859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101805, + "balance_loss_mlp": 1.01206541, + "diversity_loss_mlp": 0.0, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.016017309503016164, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74052942, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2040, + "time_per_iteration": 4.954579830169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106674, + "balance_loss_mlp": 1.09434199, + "diversity_loss_mlp": 0.0, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.0728619475730698, + "language_loss": 0.84539711, + "learning_rate": 0.0006929954931031422, + "loss": 0.85646391, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2041, + "time_per_iteration": 3.6979990005493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114297, + "balance_loss_mlp": 1.10201287, + "diversity_loss_mlp": 0.0, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.07303574322286652, + "language_loss": 0.88330269, + "learning_rate": 0.0006927080570819805, + "loss": 0.89444566, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2042, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_mlp": 1.11437607, + "diversity_loss_mlp": 0.0, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09784101638347129, + "language_loss": 0.80726093, + "learning_rate": 0.0006924205462449161, + "loss": 0.81852722, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2043, + "time_per_iteration": 2.556964159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123407, + "balance_loss_mlp": 1.11139631, + "diversity_loss_mlp": 0.0, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07674510212981295, + "language_loss": 0.81822228, + "learning_rate": 0.0006921329607035702, + "loss": 0.82945639, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.12005615, + "routerloss_mlp": 0.0, + "step": 2044, + "time_per_iteration": 3.2355051040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109464, + "balance_loss_mlp": 1.09777582, + "diversity_loss_mlp": 0.0, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0626655505852987, + "language_loss": 0.87889385, + "learning_rate": 0.0006918453005695938, + "loss": 0.88998848, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2045, + "time_per_iteration": 2.616405725479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112555, + "balance_loss_mlp": 1.10047281, + "diversity_loss_mlp": 0.0, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0593607382511463, + "language_loss": 0.8430419, + "learning_rate": 0.0006915575659546662, + "loss": 0.85416746, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2046, + "time_per_iteration": 2.6596429347991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100096, + "balance_loss_mlp": 1.08785915, + "diversity_loss_mlp": 0.0, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0680979304239865, + "language_loss": 0.80745959, + "learning_rate": 0.0006912697569704959, + "loss": 0.81846058, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2047, + "time_per_iteration": 2.5962154865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097855, + "balance_loss_mlp": 1.08564174, + "diversity_loss_mlp": 0.0, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.07634449995136075, + "language_loss": 0.8702817, + "learning_rate": 0.0006909818737288205, + "loss": 0.88126016, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.12207031, + "routerloss_mlp": 0.0, + "step": 2048, + "time_per_iteration": 2.5559332370758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111282, + "balance_loss_mlp": 1.09955215, + "diversity_loss_mlp": 0.0, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.07451514550279957, + "language_loss": 0.80715293, + "learning_rate": 0.000690693916341406, + "loss": 0.81826574, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2049, + "time_per_iteration": 2.605881690979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115507, + "balance_loss_mlp": 1.10377121, + "diversity_loss_mlp": 0.0, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.06516266173427393, + "language_loss": 0.82286257, + "learning_rate": 0.0006904058849200475, + "loss": 0.83401763, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2050, + "time_per_iteration": 2.7183115482330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.09360313, + "diversity_loss_mlp": 0.0, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.0753850450331705, + "language_loss": 0.84972727, + "learning_rate": 0.0006901177795765683, + "loss": 0.8607837, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2051, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105318, + "balance_loss_mlp": 1.09354019, + "diversity_loss_mlp": 0.0, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.06465732667856934, + "language_loss": 0.81096435, + "learning_rate": 0.0006898296004228213, + "loss": 0.82201755, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2052, + "time_per_iteration": 2.7607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050397, + "balance_loss_mlp": 1.04446077, + "diversity_loss_mlp": 0.0, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.03031396698302257, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177135, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.05932617, + "routerloss_mlp": 0.0, + "step": 2053, + "time_per_iteration": 4.876460552215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_mlp": 1.10529494, + "diversity_loss_mlp": 0.0, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.1105412420488248, + "language_loss": 0.79620701, + "learning_rate": 0.0006892530211320763, + "loss": 0.80737776, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2054, + "time_per_iteration": 2.702591896057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00944261, + "balance_loss_mlp": 1.6481061, + "diversity_loss_mlp": 0.21043469, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.03587460904718008, + "language_loss": 0.84313488, + "learning_rate": 0.000688964621218926, + "loss": 0.85257751, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01499031, + "step": 2055, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109262, + "balance_loss_mlp": 1.08063984, + "diversity_loss_mlp": 0.0, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.0862390851468888, + "language_loss": 0.80478442, + "learning_rate": 0.0006886761479432037, + "loss": 0.81571066, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2056, + "time_per_iteration": 2.8577234745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079958, + "balance_loss_mlp": 1.06739902, + "diversity_loss_mlp": 0.0, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.06874544900142358, + "language_loss": 0.84387571, + "learning_rate": 0.0006883876014169045, + "loss": 0.85467529, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2057, + "time_per_iteration": 2.572458505630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073975, + "balance_loss_mlp": 1.06154716, + "diversity_loss_mlp": 0.0, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.07681071569739906, + "language_loss": 0.90056652, + "learning_rate": 0.000688098981752052, + "loss": 0.91130626, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.12432861, + "routerloss_mlp": 0.0, + "step": 2058, + "time_per_iteration": 2.7125563621520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080003, + "balance_loss_mlp": 1.06697917, + "diversity_loss_mlp": 0.0, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.08571295812058347, + "language_loss": 0.80176479, + "learning_rate": 0.0006878102890606982, + "loss": 0.81256485, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.13043213, + "routerloss_mlp": 0.0, + "step": 2059, + "time_per_iteration": 3.0797197818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108467, + "balance_loss_mlp": 1.07161617, + "diversity_loss_mlp": 0.0, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08415103615204221, + "language_loss": 0.81576395, + "learning_rate": 0.0006875215234549239, + "loss": 0.82661068, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.1307373, + "routerloss_mlp": 0.0, + "step": 2060, + "time_per_iteration": 2.5358171463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078952, + "balance_loss_mlp": 1.06604218, + "diversity_loss_mlp": 0.0, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08360675720274492, + "language_loss": 0.85212821, + "learning_rate": 0.0006872326850468376, + "loss": 0.86291778, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.12927246, + "routerloss_mlp": 0.0, + "step": 2061, + "time_per_iteration": 2.685746669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079022, + "balance_loss_mlp": 1.06612396, + "diversity_loss_mlp": 0.0, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08669948408116639, + "language_loss": 0.78834969, + "learning_rate": 0.0006869437739485762, + "loss": 0.79913992, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.12908936, + "routerloss_mlp": 0.0, + "step": 2062, + "time_per_iteration": 2.608938455581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085084, + "balance_loss_mlp": 1.07266808, + "diversity_loss_mlp": 0.0, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.06314890183319057, + "language_loss": 0.92750764, + "learning_rate": 0.0006866547902723053, + "loss": 0.93835843, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.12420654, + "routerloss_mlp": 0.0, + "step": 2063, + "time_per_iteration": 2.654764175415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07135844, + "diversity_loss_mlp": 0.0, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.10797740353372913, + "language_loss": 0.80444092, + "learning_rate": 0.000686365734130218, + "loss": 0.81527805, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2064, + "time_per_iteration": 2.7161076068878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085534, + "balance_loss_mlp": 1.07345843, + "diversity_loss_mlp": 0.0, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.06605501724079509, + "language_loss": 0.83883071, + "learning_rate": 0.000686076605634536, + "loss": 0.84968603, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2065, + "time_per_iteration": 2.5960052013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088286, + "balance_loss_mlp": 1.07656133, + "diversity_loss_mlp": 0.0, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.06893141882644385, + "language_loss": 0.84303313, + "learning_rate": 0.0006857874048975088, + "loss": 0.85391599, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2066, + "time_per_iteration": 2.5419557094573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.08599246, + "diversity_loss_mlp": 0.0, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.07076940729430262, + "language_loss": 0.86944497, + "learning_rate": 0.0006854981320314142, + "loss": 0.88042831, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2067, + "time_per_iteration": 2.4425127506256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101956, + "balance_loss_mlp": 1.0900414, + "diversity_loss_mlp": 0.0, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08678893766230582, + "language_loss": 0.86775517, + "learning_rate": 0.0006852087871485579, + "loss": 0.87877476, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2068, + "time_per_iteration": 2.617234468460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104627, + "balance_loss_mlp": 1.09308147, + "diversity_loss_mlp": 0.0, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.08540761893483814, + "language_loss": 0.81805646, + "learning_rate": 0.0006849193703612735, + "loss": 0.82910275, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2069, + "time_per_iteration": 2.7818312644958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110023, + "balance_loss_mlp": 1.08808875, + "diversity_loss_mlp": 0.0, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.06305964525737012, + "language_loss": 0.77731991, + "learning_rate": 0.0006846298817819225, + "loss": 0.78832221, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2070, + "time_per_iteration": 2.970045328140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099656, + "balance_loss_mlp": 1.08777106, + "diversity_loss_mlp": 0.0, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.09229213766989015, + "language_loss": 0.81058359, + "learning_rate": 0.0006843403215228945, + "loss": 0.82158017, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2071, + "time_per_iteration": 2.47542405128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.08525538, + "diversity_loss_mlp": 0.0, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.06250612449775428, + "language_loss": 0.80665851, + "learning_rate": 0.0006840506896966065, + "loss": 0.81763273, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2072, + "time_per_iteration": 2.7048730850219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102151, + "balance_loss_mlp": 1.09000397, + "diversity_loss_mlp": 0.0, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07670911788950584, + "language_loss": 0.82343054, + "learning_rate": 0.0006837609864155038, + "loss": 0.83445203, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2073, + "time_per_iteration": 2.940208673477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111546, + "balance_loss_mlp": 1.09976768, + "diversity_loss_mlp": 0.0, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.06443735331096001, + "language_loss": 0.83203363, + "learning_rate": 0.0006834712117920592, + "loss": 0.84314907, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2074, + "time_per_iteration": 2.6217153072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111182, + "balance_loss_mlp": 1.09892166, + "diversity_loss_mlp": 0.0, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07401760730887977, + "language_loss": 0.85670066, + "learning_rate": 0.0006831813659387729, + "loss": 0.86781245, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2075, + "time_per_iteration": 2.5696237087249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109855, + "balance_loss_mlp": 1.09774292, + "diversity_loss_mlp": 0.0, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.05990934262108594, + "language_loss": 0.84167391, + "learning_rate": 0.0006828914489681733, + "loss": 0.85277247, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2076, + "time_per_iteration": 2.7859339714050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119025, + "balance_loss_mlp": 1.1072948, + "diversity_loss_mlp": 0.0, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.06517456650976074, + "language_loss": 0.85312855, + "learning_rate": 0.0006826014609928162, + "loss": 0.86431879, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2077, + "time_per_iteration": 2.6851699352264404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0060157, + "balance_loss_mlp": 1.02597332, + "diversity_loss_mlp": 0.1552759, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.0013651319096223075, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8380096, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094547, + "step": 2078, + "time_per_iteration": 4.859188795089722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114581, + "balance_loss_mlp": 1.1030947, + "diversity_loss_mlp": 0.0, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.0748648316539235, + "language_loss": 0.80062771, + "learning_rate": 0.0006820212724781896, + "loss": 0.81177354, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.11486816, + "routerloss_mlp": 0.0, + "step": 2079, + "time_per_iteration": 2.6628189086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.09492946, + "diversity_loss_mlp": 0.0, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.06148312623903997, + "language_loss": 0.83733618, + "learning_rate": 0.0006817310721641694, + "loss": 0.84840119, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2080, + "time_per_iteration": 2.847182512283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119227, + "balance_loss_mlp": 1.10731816, + "diversity_loss_mlp": 0.0, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.07223167054032475, + "language_loss": 0.83566946, + "learning_rate": 0.00068144080129589, + "loss": 0.84686172, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2081, + "time_per_iteration": 2.7161402702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115616, + "balance_loss_mlp": 1.10388541, + "diversity_loss_mlp": 0.0, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.07619573858560975, + "language_loss": 0.8280167, + "learning_rate": 0.0006811504599860441, + "loss": 0.83917284, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2082, + "time_per_iteration": 2.5584774017333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104984, + "balance_loss_mlp": 1.0928719, + "diversity_loss_mlp": 0.0, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.1306421138400452, + "language_loss": 0.8569895, + "learning_rate": 0.0006808600483473526, + "loss": 0.86803931, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2083, + "time_per_iteration": 2.864786148071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094797, + "balance_loss_mlp": 1.0824883, + "diversity_loss_mlp": 0.0, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.06339794743033755, + "language_loss": 0.86393988, + "learning_rate": 0.0006805695664925629, + "loss": 0.87488782, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2084, + "time_per_iteration": 2.844709634780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089155, + "balance_loss_mlp": 1.07735372, + "diversity_loss_mlp": 0.0, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.0888076684038974, + "language_loss": 0.83841193, + "learning_rate": 0.0006802790145344506, + "loss": 0.84930348, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2085, + "time_per_iteration": 2.4883856773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083386, + "balance_loss_mlp": 1.07145894, + "diversity_loss_mlp": 0.0, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07803386161895243, + "language_loss": 0.87420845, + "learning_rate": 0.0006799883925858176, + "loss": 0.88504231, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2086, + "time_per_iteration": 2.8824286460876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088787, + "balance_loss_mlp": 1.0766871, + "diversity_loss_mlp": 0.0, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.06924310288687491, + "language_loss": 0.85459089, + "learning_rate": 0.0006796977007594933, + "loss": 0.86547881, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2087, + "time_per_iteration": 2.6597371101379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00970559, + "balance_loss_mlp": 1.6983223, + "diversity_loss_mlp": 0.21244028, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.03280700890509502, + "language_loss": 0.86715519, + "learning_rate": 0.0006794069391683345, + "loss": 0.87686074, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01517779, + "step": 2088, + "time_per_iteration": 2.7649624347686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078316, + "balance_loss_mlp": 1.06610286, + "diversity_loss_mlp": 0.0, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07764554073270104, + "language_loss": 0.80781567, + "learning_rate": 0.0006791161079252248, + "loss": 0.81859887, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2089, + "time_per_iteration": 2.6467885971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082706, + "balance_loss_mlp": 1.07014716, + "diversity_loss_mlp": 0.0, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.0935978018434956, + "language_loss": 0.82482743, + "learning_rate": 0.0006788252071430747, + "loss": 0.8356545, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.12561035, + "routerloss_mlp": 0.0, + "step": 2090, + "time_per_iteration": 2.684659242630005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076621, + "balance_loss_mlp": 1.06417561, + "diversity_loss_mlp": 0.0, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.061003649340911806, + "language_loss": 0.86884034, + "learning_rate": 0.0006785342369348222, + "loss": 0.87960654, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.12451172, + "routerloss_mlp": 0.0, + "step": 2091, + "time_per_iteration": 2.7500762939453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081239, + "balance_loss_mlp": 1.06896663, + "diversity_loss_mlp": 0.0, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08323404973511926, + "language_loss": 0.79681003, + "learning_rate": 0.0006782431974134316, + "loss": 0.80762231, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2092, + "time_per_iteration": 2.554500102996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085193, + "balance_loss_mlp": 1.07266974, + "diversity_loss_mlp": 0.0, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.06323665884579813, + "language_loss": 0.89339125, + "learning_rate": 0.0006779520886918949, + "loss": 0.90424317, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2093, + "time_per_iteration": 3.0625791549682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109256, + "balance_loss_mlp": 1.08038247, + "diversity_loss_mlp": 0.0, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.06591278584355922, + "language_loss": 0.81594688, + "learning_rate": 0.0006776609108832301, + "loss": 0.82687247, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2094, + "time_per_iteration": 2.84006929397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099273, + "balance_loss_mlp": 1.08723903, + "diversity_loss_mlp": 0.0, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.07397134749055344, + "language_loss": 0.84911013, + "learning_rate": 0.0006773696641004828, + "loss": 0.86010277, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2095, + "time_per_iteration": 2.5662059783935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110289, + "balance_loss_mlp": 1.09781969, + "diversity_loss_mlp": 0.0, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.07471072764212172, + "language_loss": 0.77422667, + "learning_rate": 0.0006770783484567247, + "loss": 0.78532958, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2096, + "time_per_iteration": 3.120000123977661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106409, + "balance_loss_mlp": 1.09445786, + "diversity_loss_mlp": 0.0, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.05645154934481913, + "language_loss": 0.85885596, + "learning_rate": 0.000676786964065055, + "loss": 0.86992002, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2097, + "time_per_iteration": 2.7947449684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09767413, + "diversity_loss_mlp": 0.0, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.06468702094514471, + "language_loss": 0.78823644, + "learning_rate": 0.0006764955110385986, + "loss": 0.7993331, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2098, + "time_per_iteration": 2.7805027961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113539, + "balance_loss_mlp": 1.10162365, + "diversity_loss_mlp": 0.0, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.06520165677387538, + "language_loss": 0.80479109, + "learning_rate": 0.0006762039894905083, + "loss": 0.81592649, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2099, + "time_per_iteration": 2.5934462547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113864, + "balance_loss_mlp": 1.10191941, + "diversity_loss_mlp": 0.0, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.07619139256642768, + "language_loss": 0.80502266, + "learning_rate": 0.000675912399533962, + "loss": 0.81616127, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2100, + "time_per_iteration": 2.5193917751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0095878, + "balance_loss_mlp": 1.67460704, + "diversity_loss_mlp": 0.21229821, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.026749352452392162, + "language_loss": 0.8501215, + "learning_rate": 0.0006756207412821656, + "loss": 0.85970926, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532745, + "step": 2101, + "time_per_iteration": 3.0674142837524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125351, + "balance_loss_mlp": 1.11366224, + "diversity_loss_mlp": 0.0, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.07971707112625441, + "language_loss": 0.80680853, + "learning_rate": 0.0006753290148483505, + "loss": 0.81806201, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2102, + "time_per_iteration": 3.0177412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128591, + "balance_loss_mlp": 1.11720061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.07197972569419236, + "language_loss": 0.78862077, + "learning_rate": 0.0006750372203457752, + "loss": 0.79990667, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2103, + "time_per_iteration": 2.4715232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133472, + "balance_loss_mlp": 1.12199795, + "diversity_loss_mlp": 0.0, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.05679089538273026, + "language_loss": 0.8629868, + "learning_rate": 0.0006747453578877242, + "loss": 0.87432158, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2104, + "time_per_iteration": 2.7127907276153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133404, + "balance_loss_mlp": 1.12154305, + "diversity_loss_mlp": 0.0, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.07881786572134404, + "language_loss": 0.83325595, + "learning_rate": 0.0006744534275875085, + "loss": 0.84459001, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2105, + "time_per_iteration": 2.9968934059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_mlp": 1.11278331, + "diversity_loss_mlp": 0.0, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.06959652480101333, + "language_loss": 0.85228348, + "learning_rate": 0.0006741614295584657, + "loss": 0.86352497, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2106, + "time_per_iteration": 2.6837310791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128388, + "balance_loss_mlp": 1.1166873, + "diversity_loss_mlp": 0.0, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.07271017039443997, + "language_loss": 0.78820735, + "learning_rate": 0.0006738693639139595, + "loss": 0.79949123, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2107, + "time_per_iteration": 2.9876344203948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.09982085, + "diversity_loss_mlp": 0.0, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.07545270814647756, + "language_loss": 0.7770499, + "learning_rate": 0.0006735772307673796, + "loss": 0.78816462, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2108, + "time_per_iteration": 3.5391368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.1007216, + "diversity_loss_mlp": 0.0, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.07028810729839409, + "language_loss": 0.8317976, + "learning_rate": 0.0006732850302321421, + "loss": 0.84292281, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2109, + "time_per_iteration": 2.924703359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107897, + "balance_loss_mlp": 1.0962801, + "diversity_loss_mlp": 0.0, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.08331494403878895, + "language_loss": 0.84220135, + "learning_rate": 0.00067299276242169, + "loss": 0.85328031, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2110, + "time_per_iteration": 2.6628758907318115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00591895, + "balance_loss_mlp": 1.01285744, + "diversity_loss_mlp": 0.15005666, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.0011574932258311419, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.74974066, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01043818, + "step": 2111, + "time_per_iteration": 4.913798093795776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100112, + "balance_loss_mlp": 1.0884769, + "diversity_loss_mlp": 0.0, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.0671840972805921, + "language_loss": 0.77974957, + "learning_rate": 0.0006724080254290395, + "loss": 0.79075068, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2112, + "time_per_iteration": 2.790695905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087298, + "balance_loss_mlp": 1.07509685, + "diversity_loss_mlp": 0.0, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.06921545909042545, + "language_loss": 0.89956391, + "learning_rate": 0.0006721155564738566, + "loss": 0.91043687, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2113, + "time_per_iteration": 2.654052495956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590146, + "balance_loss_mlp": 1.01069736, + "diversity_loss_mlp": 0.14874323, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.001129022163549877, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79212785, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01042587, + "step": 2114, + "time_per_iteration": 5.02890682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095601, + "balance_loss_mlp": 1.08348942, + "diversity_loss_mlp": 0.0, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.06673632265299649, + "language_loss": 0.85678279, + "learning_rate": 0.0006715304182135078, + "loss": 0.86773884, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.12109375, + "routerloss_mlp": 0.0, + "step": 2115, + "time_per_iteration": 2.6665151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.07951176, + "diversity_loss_mlp": 0.0, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.08902530655488881, + "language_loss": 0.8859638, + "learning_rate": 0.0006712377491355127, + "loss": 0.89688623, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.12731934, + "routerloss_mlp": 0.0, + "step": 2116, + "time_per_iteration": 2.9124083518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091485, + "balance_loss_mlp": 1.07896256, + "diversity_loss_mlp": 0.0, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.06275972542298792, + "language_loss": 0.81009984, + "learning_rate": 0.0006709450135771274, + "loss": 0.8210147, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2117, + "time_per_iteration": 2.9538469314575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109045, + "balance_loss_mlp": 1.07800436, + "diversity_loss_mlp": 0.0, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.06731197780562713, + "language_loss": 0.8655895, + "learning_rate": 0.0006706522116520023, + "loss": 0.87649393, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2118, + "time_per_iteration": 2.6403684616088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.08127189, + "diversity_loss_mlp": 0.0, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.07339707473672348, + "language_loss": 0.82936597, + "learning_rate": 0.0006703593434738127, + "loss": 0.84030455, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.12579346, + "routerloss_mlp": 0.0, + "step": 2119, + "time_per_iteration": 2.706406354904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_mlp": 1.0847466, + "diversity_loss_mlp": 0.0, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.05750096894007485, + "language_loss": 0.78123623, + "learning_rate": 0.0006700664091562604, + "loss": 0.79220533, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2120, + "time_per_iteration": 2.5515992641448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102687, + "balance_loss_mlp": 1.09045601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.08484846499370094, + "language_loss": 0.85241771, + "learning_rate": 0.0006697734088130725, + "loss": 0.86344457, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2121, + "time_per_iteration": 2.5997116565704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094995, + "balance_loss_mlp": 1.08268619, + "diversity_loss_mlp": 0.0, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.06901349076849703, + "language_loss": 0.85628182, + "learning_rate": 0.0006694803425580018, + "loss": 0.86723173, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.12310791, + "routerloss_mlp": 0.0, + "step": 2122, + "time_per_iteration": 2.975572109222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090381, + "balance_loss_mlp": 1.07825708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08123936309079019, + "language_loss": 0.84420574, + "learning_rate": 0.0006691872105048268, + "loss": 0.85510951, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2123, + "time_per_iteration": 2.5785253047943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109138, + "balance_loss_mlp": 1.07879114, + "diversity_loss_mlp": 0.0, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.06700388653835253, + "language_loss": 0.84703517, + "learning_rate": 0.0006688940127673513, + "loss": 0.85794896, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.12597656, + "routerloss_mlp": 0.0, + "step": 2124, + "time_per_iteration": 2.794312000274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080411, + "balance_loss_mlp": 1.06789398, + "diversity_loss_mlp": 0.0, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.11477925500015464, + "language_loss": 0.85646629, + "learning_rate": 0.0006686007494594049, + "loss": 0.86727041, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2125, + "time_per_iteration": 2.8629977703094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.06869102, + "diversity_loss_mlp": 0.0, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.08770785423003769, + "language_loss": 0.80226219, + "learning_rate": 0.0006683074206948425, + "loss": 0.81306815, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2126, + "time_per_iteration": 2.5477960109710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.06884146, + "diversity_loss_mlp": 0.0, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.0688791895715759, + "language_loss": 0.81257784, + "learning_rate": 0.0006680140265875443, + "loss": 0.82338405, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2127, + "time_per_iteration": 2.824706792831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.06504989, + "diversity_loss_mlp": 0.0, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.0706270365820259, + "language_loss": 0.95744675, + "learning_rate": 0.0006677205672514162, + "loss": 0.96821618, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2128, + "time_per_iteration": 2.6173171997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081253, + "balance_loss_mlp": 1.06944525, + "diversity_loss_mlp": 0.0, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.08385407721227026, + "language_loss": 0.88751161, + "learning_rate": 0.000667427042800389, + "loss": 0.89832413, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2129, + "time_per_iteration": 2.746561288833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090066, + "balance_loss_mlp": 1.07828188, + "diversity_loss_mlp": 0.0, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.0802302808929841, + "language_loss": 0.82728851, + "learning_rate": 0.0006671334533484192, + "loss": 0.83818918, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2130, + "time_per_iteration": 2.7765390872955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094937, + "balance_loss_mlp": 1.08306408, + "diversity_loss_mlp": 0.0, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.06494454218377498, + "language_loss": 0.83394802, + "learning_rate": 0.0006668397990094881, + "loss": 0.84489739, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2131, + "time_per_iteration": 2.6814444065093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094306, + "balance_loss_mlp": 1.08240891, + "diversity_loss_mlp": 0.0, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08851492372685672, + "language_loss": 0.84863144, + "learning_rate": 0.0006665460798976027, + "loss": 0.8595745, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2132, + "time_per_iteration": 2.734208822250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.08680749, + "diversity_loss_mlp": 0.0, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.07834997970618658, + "language_loss": 0.8153789, + "learning_rate": 0.0006662522961267947, + "loss": 0.82635975, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2133, + "time_per_iteration": 2.642789363861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100422, + "balance_loss_mlp": 1.0889008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.06175420460070233, + "language_loss": 0.87238759, + "learning_rate": 0.0006659584478111211, + "loss": 0.88339174, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2134, + "time_per_iteration": 2.8097283840179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110764, + "balance_loss_mlp": 1.09618366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.07261990262121029, + "language_loss": 0.82762325, + "learning_rate": 0.000665664535064664, + "loss": 0.83869964, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2135, + "time_per_iteration": 3.034973382949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118337, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07277612177905571, + "language_loss": 0.82753229, + "learning_rate": 0.0006653705580015303, + "loss": 0.83871567, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2136, + "time_per_iteration": 2.719024181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130623, + "balance_loss_mlp": 1.11913705, + "diversity_loss_mlp": 0.0, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.09561286081072368, + "language_loss": 0.86333638, + "learning_rate": 0.0006650765167358523, + "loss": 0.87464261, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2137, + "time_per_iteration": 2.798013210296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119744, + "balance_loss_mlp": 1.10816908, + "diversity_loss_mlp": 0.0, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.06575385598885217, + "language_loss": 0.90120316, + "learning_rate": 0.0006647824113817864, + "loss": 0.9124006, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2138, + "time_per_iteration": 2.5290029048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00941862, + "balance_loss_mlp": 1.64172852, + "diversity_loss_mlp": 0.21382158, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.027199696320483784, + "language_loss": 0.81782889, + "learning_rate": 0.000664488242053515, + "loss": 0.8272475, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01408678, + "step": 2139, + "time_per_iteration": 2.7610864639282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111286, + "balance_loss_mlp": 1.1009748, + "diversity_loss_mlp": 0.0, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.07795493316399416, + "language_loss": 0.83879304, + "learning_rate": 0.0006641940088652445, + "loss": 0.84992164, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2140, + "time_per_iteration": 2.7797446250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.08682573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09321248474614077, + "language_loss": 0.82214057, + "learning_rate": 0.0006638997119312065, + "loss": 0.83312857, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2141, + "time_per_iteration": 2.688427209854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_mlp": 1.07580638, + "diversity_loss_mlp": 0.0, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.05051376163622262, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76146024, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2142, + "time_per_iteration": 4.916438817977905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084339, + "balance_loss_mlp": 1.07186329, + "diversity_loss_mlp": 0.0, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.0666522569579182, + "language_loss": 0.84487629, + "learning_rate": 0.000663310927282877, + "loss": 0.85571963, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.12475586, + "routerloss_mlp": 0.0, + "step": 2143, + "time_per_iteration": 2.742781162261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.06302905, + "diversity_loss_mlp": 0.0, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07553146792883669, + "language_loss": 0.85816187, + "learning_rate": 0.000663016439797172, + "loss": 0.86891896, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2144, + "time_per_iteration": 2.602322578430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.06363273, + "diversity_loss_mlp": 0.0, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.09188682549299809, + "language_loss": 0.80924189, + "learning_rate": 0.0006627218890228724, + "loss": 0.82000041, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2145, + "time_per_iteration": 2.76790452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081048, + "balance_loss_mlp": 1.0687809, + "diversity_loss_mlp": 0.0, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.09235653357512275, + "language_loss": 0.83860421, + "learning_rate": 0.0006624272750743326, + "loss": 0.84941471, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.12261963, + "routerloss_mlp": 0.0, + "step": 2146, + "time_per_iteration": 2.986267566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085622, + "balance_loss_mlp": 1.073385, + "diversity_loss_mlp": 0.0, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06221373460159241, + "language_loss": 0.82866907, + "learning_rate": 0.0006621325980659322, + "loss": 0.83952528, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.12231445, + "routerloss_mlp": 0.0, + "step": 2147, + "time_per_iteration": 2.78074049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091906, + "balance_loss_mlp": 1.07981253, + "diversity_loss_mlp": 0.0, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.06655163113776748, + "language_loss": 0.81613219, + "learning_rate": 0.000661837858112075, + "loss": 0.82705128, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2148, + "time_per_iteration": 2.8118457794189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00920817, + "balance_loss_mlp": 1.59947157, + "diversity_loss_mlp": 0.21162269, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.03430222900415099, + "language_loss": 0.88696158, + "learning_rate": 0.0006615430553271888, + "loss": 0.89616972, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01526995, + "step": 2149, + "time_per_iteration": 2.809389352798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115952, + "balance_loss_mlp": 1.10438299, + "diversity_loss_mlp": 0.0, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06824786639125466, + "language_loss": 0.85333586, + "learning_rate": 0.0006612481898257264, + "loss": 0.8644954, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2150, + "time_per_iteration": 2.855074644088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137757, + "balance_loss_mlp": 1.12599659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.07789693292988349, + "language_loss": 0.851385, + "learning_rate": 0.000660953261722165, + "loss": 0.86276257, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.11749268, + "routerloss_mlp": 0.0, + "step": 2151, + "time_per_iteration": 2.5938022136688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113704, + "balance_loss_mlp": 1.12522054, + "diversity_loss_mlp": 0.0, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.08228338378299185, + "language_loss": 0.82884097, + "learning_rate": 0.0006606582711310055, + "loss": 0.84021133, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2152, + "time_per_iteration": 2.7282497882843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145368, + "balance_loss_mlp": 1.13366747, + "diversity_loss_mlp": 0.0, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.06559194318793425, + "language_loss": 0.82812124, + "learning_rate": 0.0006603632181667736, + "loss": 0.83957493, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2153, + "time_per_iteration": 2.6664750576019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103836, + "balance_loss_mlp": 1.09754133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.03767833543400207, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.8004716, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 2154, + "time_per_iteration": 4.910309791564941 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135237, + "balance_loss_mlp": 1.12367392, + "diversity_loss_mlp": 0.0, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.0807614788835298, + "language_loss": 0.81897664, + "learning_rate": 0.0006597729255773153, + "loss": 0.83032906, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2155, + "time_per_iteration": 2.509021520614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146058, + "balance_loss_mlp": 1.13441765, + "diversity_loss_mlp": 0.0, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.07993173196210833, + "language_loss": 0.82465029, + "learning_rate": 0.0006594776861812608, + "loss": 0.83611095, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2156, + "time_per_iteration": 2.656454086303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151315, + "balance_loss_mlp": 1.13991857, + "diversity_loss_mlp": 0.0, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06494614409867079, + "language_loss": 0.8654387, + "learning_rate": 0.0006591823848704776, + "loss": 0.87695187, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.11395264, + "routerloss_mlp": 0.0, + "step": 2157, + "time_per_iteration": 2.9039251804351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134696, + "balance_loss_mlp": 1.12316287, + "diversity_loss_mlp": 0.0, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07584878913150254, + "language_loss": 0.81510401, + "learning_rate": 0.0006588870217596117, + "loss": 0.82645094, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2158, + "time_per_iteration": 2.7366249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121714, + "balance_loss_mlp": 1.11010289, + "diversity_loss_mlp": 0.0, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.0768974217493938, + "language_loss": 0.8567549, + "learning_rate": 0.0006585915969633334, + "loss": 0.86797202, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2159, + "time_per_iteration": 2.557969331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105923, + "balance_loss_mlp": 1.09437764, + "diversity_loss_mlp": 0.0, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.06453825749462137, + "language_loss": 0.89545041, + "learning_rate": 0.0006582961105963366, + "loss": 0.90650964, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2160, + "time_per_iteration": 2.782766103744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089959, + "balance_loss_mlp": 1.07836008, + "diversity_loss_mlp": 0.0, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.09389311079563152, + "language_loss": 0.77639234, + "learning_rate": 0.0006580005627733395, + "loss": 0.78729188, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2161, + "time_per_iteration": 2.7049734592437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086569, + "balance_loss_mlp": 1.07492197, + "diversity_loss_mlp": 0.0, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.08236412019602501, + "language_loss": 0.81618345, + "learning_rate": 0.0006577049536090838, + "loss": 0.8270492, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2162, + "time_per_iteration": 2.723243236541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078674, + "balance_loss_mlp": 1.06676459, + "diversity_loss_mlp": 0.0, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.09869721655750711, + "language_loss": 0.85591501, + "learning_rate": 0.000657409283218335, + "loss": 0.86670172, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2163, + "time_per_iteration": 2.64973783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078005, + "balance_loss_mlp": 1.0662148, + "diversity_loss_mlp": 0.0, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.06806079796586995, + "language_loss": 0.81014043, + "learning_rate": 0.0006571135517158829, + "loss": 0.82092047, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2164, + "time_per_iteration": 2.6662614345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261192, + "balance_loss_mlp": 1.25542271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.0963910676883023, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.78025252, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2165, + "time_per_iteration": 4.733267068862915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084468, + "balance_loss_mlp": 1.07227921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.08489426271121504, + "language_loss": 0.83098751, + "learning_rate": 0.0006565219058351444, + "loss": 0.84183216, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.12194824, + "routerloss_mlp": 0.0, + "step": 2166, + "time_per_iteration": 2.555367946624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07506573, + "diversity_loss_mlp": 0.0, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.0663020588108057, + "language_loss": 0.82663929, + "learning_rate": 0.0006562259916865553, + "loss": 0.83751583, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.12585449, + "routerloss_mlp": 0.0, + "step": 2167, + "time_per_iteration": 2.5647947788238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085528, + "balance_loss_mlp": 1.07305884, + "diversity_loss_mlp": 0.0, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.11811458423881586, + "language_loss": 0.79392177, + "learning_rate": 0.0006559300168856573, + "loss": 0.80477709, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2168, + "time_per_iteration": 2.737071990966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090803, + "balance_loss_mlp": 1.07860184, + "diversity_loss_mlp": 0.0, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07183663020795078, + "language_loss": 0.86060214, + "learning_rate": 0.0006556339815473577, + "loss": 0.87151015, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.12200928, + "routerloss_mlp": 0.0, + "step": 2169, + "time_per_iteration": 2.6506707668304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087149, + "balance_loss_mlp": 1.07504892, + "diversity_loss_mlp": 0.0, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.07609133400056706, + "language_loss": 0.86409211, + "learning_rate": 0.000655337885786588, + "loss": 0.87496364, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2170, + "time_per_iteration": 2.8835949897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078201, + "balance_loss_mlp": 1.06654263, + "diversity_loss_mlp": 0.0, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08298304012821277, + "language_loss": 0.85129267, + "learning_rate": 0.0006550417297183025, + "loss": 0.86207461, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2171, + "time_per_iteration": 2.6195385456085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087717, + "balance_loss_mlp": 1.07584357, + "diversity_loss_mlp": 0.0, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07223590906341684, + "language_loss": 0.81395489, + "learning_rate": 0.0006547455134574793, + "loss": 0.82483202, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.11877441, + "routerloss_mlp": 0.0, + "step": 2172, + "time_per_iteration": 2.688387155532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091071, + "balance_loss_mlp": 1.07947183, + "diversity_loss_mlp": 0.0, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06986640066350178, + "language_loss": 0.84520721, + "learning_rate": 0.0006544492371191198, + "loss": 0.85611784, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2173, + "time_per_iteration": 3.1099753379821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094341, + "balance_loss_mlp": 1.08226562, + "diversity_loss_mlp": 0.0, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.06657472623207703, + "language_loss": 0.8341983, + "learning_rate": 0.0006541529008182485, + "loss": 0.84514177, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2174, + "time_per_iteration": 3.203376054763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107353, + "balance_loss_mlp": 1.09567666, + "diversity_loss_mlp": 0.0, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.07167092475387357, + "language_loss": 0.87561977, + "learning_rate": 0.0006538565046699136, + "loss": 0.8866933, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2175, + "time_per_iteration": 2.6136248111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122872, + "balance_loss_mlp": 1.1111474, + "diversity_loss_mlp": 0.0, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.08073018870716439, + "language_loss": 0.81308544, + "learning_rate": 0.0006535600487891862, + "loss": 0.82431418, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2176, + "time_per_iteration": 2.8484995365142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112142, + "balance_loss_mlp": 1.10968423, + "diversity_loss_mlp": 0.0, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.06933020813080157, + "language_loss": 0.89047962, + "learning_rate": 0.0006532635332911603, + "loss": 0.90169382, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2177, + "time_per_iteration": 2.6983814239501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139797, + "balance_loss_mlp": 1.12828767, + "diversity_loss_mlp": 0.0, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.07833316419755533, + "language_loss": 0.80340332, + "learning_rate": 0.0006529669582909541, + "loss": 0.81480134, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2178, + "time_per_iteration": 3.247034788131714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130167, + "balance_loss_mlp": 1.11881781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08850961832331757, + "language_loss": 0.85867965, + "learning_rate": 0.0006526703239037077, + "loss": 0.86998129, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2179, + "time_per_iteration": 2.6653683185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00933718, + "balance_loss_mlp": 1.62844765, + "diversity_loss_mlp": 0.20954823, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.029582524443817385, + "language_loss": 0.86593473, + "learning_rate": 0.0006523736302445851, + "loss": 0.87527192, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01471971, + "step": 2180, + "time_per_iteration": 2.857030153274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120353, + "balance_loss_mlp": 1.10893881, + "diversity_loss_mlp": 0.0, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.0687803817541909, + "language_loss": 0.77392578, + "learning_rate": 0.0006520768774287728, + "loss": 0.78512931, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2181, + "time_per_iteration": 5.625683307647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114289, + "balance_loss_mlp": 1.10282135, + "diversity_loss_mlp": 0.0, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06088029266780351, + "language_loss": 0.85493296, + "learning_rate": 0.0006517800655714806, + "loss": 0.86607587, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2182, + "time_per_iteration": 2.812955617904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105622, + "balance_loss_mlp": 1.09442866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07098705372074567, + "language_loss": 0.85399854, + "learning_rate": 0.0006514831947879407, + "loss": 0.86505473, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.11193848, + "routerloss_mlp": 0.0, + "step": 2183, + "time_per_iteration": 2.961418867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08642888, + "diversity_loss_mlp": 0.0, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.08450852264083888, + "language_loss": 0.78323019, + "learning_rate": 0.0006511862651934091, + "loss": 0.79420632, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2184, + "time_per_iteration": 3.076414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091172, + "balance_loss_mlp": 1.07956707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06921087236063693, + "language_loss": 0.82092035, + "learning_rate": 0.0006508892769031638, + "loss": 0.83183205, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2185, + "time_per_iteration": 2.638606309890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089875, + "balance_loss_mlp": 1.07868707, + "diversity_loss_mlp": 0.0, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.07895440454445611, + "language_loss": 0.87322706, + "learning_rate": 0.000650592230032506, + "loss": 0.88412583, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2186, + "time_per_iteration": 2.702061176300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093256, + "balance_loss_mlp": 1.0815382, + "diversity_loss_mlp": 0.0, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07748698496632533, + "language_loss": 0.85121393, + "learning_rate": 0.0006502951246967595, + "loss": 0.8621465, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2187, + "time_per_iteration": 2.871629476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087261, + "balance_loss_mlp": 1.07582331, + "diversity_loss_mlp": 0.0, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.06016607527200091, + "language_loss": 0.86913472, + "learning_rate": 0.0006499979610112706, + "loss": 0.88000733, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2188, + "time_per_iteration": 2.795278787612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107962, + "balance_loss_mlp": 1.06803894, + "diversity_loss_mlp": 0.0, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.0593739697007924, + "language_loss": 0.84024572, + "learning_rate": 0.000649700739091409, + "loss": 0.85104191, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2189, + "time_per_iteration": 2.822756290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123251, + "balance_loss_mlp": 1.11500144, + "diversity_loss_mlp": 0.0, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03860831682793276, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74959522, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.08251953, + "routerloss_mlp": 0.0, + "step": 2190, + "time_per_iteration": 4.79919958114624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082066, + "balance_loss_mlp": 1.07052088, + "diversity_loss_mlp": 0.0, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06761793691364075, + "language_loss": 0.85737348, + "learning_rate": 0.0006491061210101557, + "loss": 0.86819422, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2191, + "time_per_iteration": 2.661578416824341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094285, + "balance_loss_mlp": 1.08270931, + "diversity_loss_mlp": 0.0, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0725556462678514, + "language_loss": 0.83956218, + "learning_rate": 0.0006488087250796157, + "loss": 0.85050505, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2192, + "time_per_iteration": 2.881225347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095445, + "balance_loss_mlp": 1.08376861, + "diversity_loss_mlp": 0.0, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.09298126342392905, + "language_loss": 0.81662476, + "learning_rate": 0.0006485112713764049, + "loss": 0.82757914, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2193, + "time_per_iteration": 2.8921914100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.08214593, + "diversity_loss_mlp": 0.0, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.058244545196029895, + "language_loss": 0.83715278, + "learning_rate": 0.0006482137600160051, + "loss": 0.84809017, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2194, + "time_per_iteration": 2.484341859817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094196, + "balance_loss_mlp": 1.08240056, + "diversity_loss_mlp": 0.0, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.08574033239321836, + "language_loss": 0.847399, + "learning_rate": 0.0006479161911139206, + "loss": 0.85834098, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2195, + "time_per_iteration": 2.5937106609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082105, + "balance_loss_mlp": 1.07043433, + "diversity_loss_mlp": 0.0, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08791937036502419, + "language_loss": 0.85522735, + "learning_rate": 0.0006476185647856778, + "loss": 0.86604846, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2196, + "time_per_iteration": 2.569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080125, + "balance_loss_mlp": 1.06815672, + "diversity_loss_mlp": 0.0, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.07778870715402122, + "language_loss": 0.82192588, + "learning_rate": 0.0006473208811468255, + "loss": 0.83272707, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2197, + "time_per_iteration": 2.899557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072137, + "balance_loss_mlp": 1.06046605, + "diversity_loss_mlp": 0.0, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.07330307904629892, + "language_loss": 0.84140831, + "learning_rate": 0.0006470231403129347, + "loss": 0.85212964, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.11663818, + "routerloss_mlp": 0.0, + "step": 2198, + "time_per_iteration": 2.602447509765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106933, + "balance_loss_mlp": 1.05760026, + "diversity_loss_mlp": 0.0, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06409293690085444, + "language_loss": 0.81590885, + "learning_rate": 0.0006467253423995988, + "loss": 0.82660222, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2199, + "time_per_iteration": 2.8557229042053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107934, + "balance_loss_mlp": 1.06755078, + "diversity_loss_mlp": 0.0, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.07244216805562081, + "language_loss": 0.78831869, + "learning_rate": 0.000646427487522433, + "loss": 0.79911208, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2200, + "time_per_iteration": 2.65742826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084908, + "balance_loss_mlp": 1.07336855, + "diversity_loss_mlp": 0.0, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.07121994515744344, + "language_loss": 0.83032513, + "learning_rate": 0.0006461295757970749, + "loss": 0.84117424, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2201, + "time_per_iteration": 2.950655698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090426, + "balance_loss_mlp": 1.07880902, + "diversity_loss_mlp": 0.0, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.07713064950594434, + "language_loss": 0.81538546, + "learning_rate": 0.0006458316073391839, + "loss": 0.82628965, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2202, + "time_per_iteration": 2.8609914779663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089677, + "balance_loss_mlp": 1.07874584, + "diversity_loss_mlp": 0.0, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.07022827859020209, + "language_loss": 0.87709206, + "learning_rate": 0.0006455335822644422, + "loss": 0.88798881, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2203, + "time_per_iteration": 2.6978323459625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118526, + "balance_loss_mlp": 1.10743332, + "diversity_loss_mlp": 0.0, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.08724206882012846, + "language_loss": 0.78530163, + "learning_rate": 0.0006452355006885527, + "loss": 0.79648691, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2204, + "time_per_iteration": 2.686579704284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00922718, + "balance_loss_mlp": 1.60671031, + "diversity_loss_mlp": 0.20807257, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.038668439213979985, + "language_loss": 0.8761735, + "learning_rate": 0.0006449373627272412, + "loss": 0.88540065, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01532654, + "step": 2205, + "time_per_iteration": 2.7558722496032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112883, + "balance_loss_mlp": 1.10164738, + "diversity_loss_mlp": 0.0, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08032286277613819, + "language_loss": 0.82142913, + "learning_rate": 0.0006446391684962553, + "loss": 0.83255792, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2206, + "time_per_iteration": 2.6579248905181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117757, + "balance_loss_mlp": 1.10650921, + "diversity_loss_mlp": 0.0, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.06707307211931093, + "language_loss": 0.82899106, + "learning_rate": 0.000644340918111364, + "loss": 0.8401686, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2207, + "time_per_iteration": 2.5347208976745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117145, + "balance_loss_mlp": 1.10573626, + "diversity_loss_mlp": 0.0, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.09153331321335235, + "language_loss": 0.84820396, + "learning_rate": 0.0006440426116883585, + "loss": 0.85937536, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2208, + "time_per_iteration": 2.5513036251068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.11258864, + "diversity_loss_mlp": 0.0, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.07442494649717855, + "language_loss": 0.86227304, + "learning_rate": 0.0006437442493430519, + "loss": 0.87351412, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2209, + "time_per_iteration": 2.6560840606689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120019, + "balance_loss_mlp": 1.10829473, + "diversity_loss_mlp": 0.0, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.09545289030190586, + "language_loss": 0.86441422, + "learning_rate": 0.000643445831191278, + "loss": 0.8756144, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2210, + "time_per_iteration": 2.9028308391571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103179, + "balance_loss_mlp": 1.09162724, + "diversity_loss_mlp": 0.0, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.07646392549286844, + "language_loss": 0.81526744, + "learning_rate": 0.0006431473573488937, + "loss": 0.82629919, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2211, + "time_per_iteration": 2.7377443313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089807, + "balance_loss_mlp": 1.0782795, + "diversity_loss_mlp": 0.0, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.08107145257136338, + "language_loss": 0.85147351, + "learning_rate": 0.0006428488279317765, + "loss": 0.86237156, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2212, + "time_per_iteration": 2.6276626586914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.08065951, + "diversity_loss_mlp": 0.0, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.09124161172132733, + "language_loss": 0.87490094, + "learning_rate": 0.0006425502430558259, + "loss": 0.88581866, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.11120605, + "routerloss_mlp": 0.0, + "step": 2213, + "time_per_iteration": 2.588928699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.08046961, + "diversity_loss_mlp": 0.0, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.06865062693642494, + "language_loss": 0.84588826, + "learning_rate": 0.0006422516028369628, + "loss": 0.85680431, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.11138916, + "routerloss_mlp": 0.0, + "step": 2214, + "time_per_iteration": 2.639619827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085781, + "balance_loss_mlp": 1.07456374, + "diversity_loss_mlp": 0.0, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.06481575152476399, + "language_loss": 0.83497036, + "learning_rate": 0.0006419529073911296, + "loss": 0.84582818, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2215, + "time_per_iteration": 2.8564555644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091551, + "balance_loss_mlp": 1.08075058, + "diversity_loss_mlp": 0.0, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.07537518077633425, + "language_loss": 0.85102242, + "learning_rate": 0.0006416541568342901, + "loss": 0.86193788, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2216, + "time_per_iteration": 2.8998327255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082292, + "balance_loss_mlp": 1.07092535, + "diversity_loss_mlp": 0.0, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.06331803259599181, + "language_loss": 0.84347832, + "learning_rate": 0.0006413553512824297, + "loss": 0.85430121, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2217, + "time_per_iteration": 2.754044532775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084192, + "balance_loss_mlp": 1.07307625, + "diversity_loss_mlp": 0.0, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07616444203019798, + "language_loss": 0.84374213, + "learning_rate": 0.0006410564908515549, + "loss": 0.85458404, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2218, + "time_per_iteration": 2.724478006362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06966138, + "diversity_loss_mlp": 0.0, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.0731173396075932, + "language_loss": 0.85161233, + "learning_rate": 0.0006407575756576935, + "loss": 0.86242241, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2219, + "time_per_iteration": 2.754624128341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093699, + "balance_loss_mlp": 1.08191478, + "diversity_loss_mlp": 0.0, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.068521011535794, + "language_loss": 0.87612599, + "learning_rate": 0.0006404586058168951, + "loss": 0.88706297, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2220, + "time_per_iteration": 2.6972298622131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100965, + "balance_loss_mlp": 1.08927631, + "diversity_loss_mlp": 0.0, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.1033551804820373, + "language_loss": 0.86327708, + "learning_rate": 0.0006401595814452296, + "loss": 0.87428677, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2221, + "time_per_iteration": 2.6071925163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.08816695, + "diversity_loss_mlp": 0.0, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07649462730323824, + "language_loss": 0.8070569, + "learning_rate": 0.000639860502658789, + "loss": 0.81805706, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2222, + "time_per_iteration": 2.6844141483306885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101843, + "balance_loss_mlp": 1.08965993, + "diversity_loss_mlp": 0.0, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0652732350229211, + "language_loss": 0.84929889, + "learning_rate": 0.0006395613695736853, + "loss": 0.86031729, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2223, + "time_per_iteration": 2.6799042224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091013, + "balance_loss_mlp": 1.07850194, + "diversity_loss_mlp": 0.0, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.10552751254703834, + "language_loss": 0.82026577, + "learning_rate": 0.0006392621823060529, + "loss": 0.83117592, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.12518311, + "routerloss_mlp": 0.0, + "step": 2224, + "time_per_iteration": 2.722675323486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083211, + "balance_loss_mlp": 1.07109332, + "diversity_loss_mlp": 0.0, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.0790777786133485, + "language_loss": 0.8508532, + "learning_rate": 0.0006389629409720465, + "loss": 0.86168534, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2225, + "time_per_iteration": 2.6559393405914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084084, + "balance_loss_mlp": 1.07179379, + "diversity_loss_mlp": 0.0, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0811747132385773, + "language_loss": 0.88654399, + "learning_rate": 0.0006386636456878417, + "loss": 0.89738482, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2226, + "time_per_iteration": 2.898261308670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_mlp": 1.07153535, + "diversity_loss_mlp": 0.0, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.07696212536929578, + "language_loss": 0.92413348, + "learning_rate": 0.0006383642965696353, + "loss": 0.93497235, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2227, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00932178, + "balance_loss_mlp": 1.62005818, + "diversity_loss_mlp": 0.21207821, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.033827312051000154, + "language_loss": 0.83018744, + "learning_rate": 0.000638064893733645, + "loss": 0.83950925, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01611001, + "step": 2228, + "time_per_iteration": 2.74554705619812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00939878, + "balance_loss_mlp": 1.63503206, + "diversity_loss_mlp": 0.21170495, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.03357304306136308, + "language_loss": 0.90087909, + "learning_rate": 0.000637765437296109, + "loss": 0.91027784, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01650969, + "step": 2229, + "time_per_iteration": 2.6807308197021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086799, + "balance_loss_mlp": 1.07446718, + "diversity_loss_mlp": 0.0, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.09425394332621637, + "language_loss": 0.85585725, + "learning_rate": 0.000637465927373287, + "loss": 0.86672527, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.12329102, + "routerloss_mlp": 0.0, + "step": 2230, + "time_per_iteration": 2.6279454231262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088133, + "balance_loss_mlp": 1.0761342, + "diversity_loss_mlp": 0.0, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.13300209785278838, + "language_loss": 0.79446864, + "learning_rate": 0.000637166364081459, + "loss": 0.80534995, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.11993408, + "routerloss_mlp": 0.0, + "step": 2231, + "time_per_iteration": 2.7252066135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108179, + "balance_loss_mlp": 1.07001245, + "diversity_loss_mlp": 0.0, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.08046243261781533, + "language_loss": 0.84081841, + "learning_rate": 0.0006368667475369256, + "loss": 0.85163629, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2232, + "time_per_iteration": 2.756286382675171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03840148, + "diversity_loss_mlp": 0.0, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.02809293853716727, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79574001, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.078125, + "routerloss_mlp": 0.0, + "step": 2233, + "time_per_iteration": 4.852276086807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_mlp": 1.02313304, + "diversity_loss_mlp": 0.0, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.02329901381823612, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79926044, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2234, + "time_per_iteration": 4.812516689300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107186, + "balance_loss_mlp": 1.09534228, + "diversity_loss_mlp": 0.0, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.06628794940731256, + "language_loss": 0.86166692, + "learning_rate": 0.0006359675795504112, + "loss": 0.87273884, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.1184082, + "routerloss_mlp": 0.0, + "step": 2235, + "time_per_iteration": 2.7691314220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112399, + "balance_loss_mlp": 1.11230159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08124483128316094, + "language_loss": 0.74637383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75761378, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.11676025, + "routerloss_mlp": 0.0, + "step": 2236, + "time_per_iteration": 3.51676082611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138804, + "balance_loss_mlp": 1.12733603, + "diversity_loss_mlp": 0.0, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.08045247853644188, + "language_loss": 0.85975677, + "learning_rate": 0.0006353678700956511, + "loss": 0.87114477, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2237, + "time_per_iteration": 2.5487072467803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137661, + "balance_loss_mlp": 1.12605572, + "diversity_loss_mlp": 0.0, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.08414636037035166, + "language_loss": 0.84184766, + "learning_rate": 0.0006350679364783569, + "loss": 0.85322422, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2238, + "time_per_iteration": 2.730128288269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113973, + "balance_loss_mlp": 1.1279577, + "diversity_loss_mlp": 0.0, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.06707032645836293, + "language_loss": 0.85872072, + "learning_rate": 0.0006347679504230393, + "loss": 0.87011802, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2239, + "time_per_iteration": 2.640791893005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136631, + "balance_loss_mlp": 1.12453079, + "diversity_loss_mlp": 0.0, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07174503893432663, + "language_loss": 0.7626543, + "learning_rate": 0.0006344679120461632, + "loss": 0.77402061, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2240, + "time_per_iteration": 3.3352768421173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128316, + "balance_loss_mlp": 1.11687779, + "diversity_loss_mlp": 0.0, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.08647233478950261, + "language_loss": 0.79984182, + "learning_rate": 0.0006341678214642134, + "loss": 0.81112498, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2241, + "time_per_iteration": 2.662132740020752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114748, + "balance_loss_mlp": 1.10336995, + "diversity_loss_mlp": 0.0, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.06482352137494116, + "language_loss": 0.82986903, + "learning_rate": 0.0006338676787936963, + "loss": 0.84101653, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2242, + "time_per_iteration": 3.064518451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123318, + "balance_loss_mlp": 1.11183178, + "diversity_loss_mlp": 0.0, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.07554467546841755, + "language_loss": 0.84015846, + "learning_rate": 0.0006335674841511367, + "loss": 0.85139167, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2243, + "time_per_iteration": 2.7494354248046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067953, + "balance_loss_mlp": 1.06189752, + "diversity_loss_mlp": 0.0, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.020266409588932003, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80249119, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2244, + "time_per_iteration": 5.019898414611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_mlp": 1.05208015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.017496917907237546, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78423691, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.06054688, + "routerloss_mlp": 0.0, + "step": 2245, + "time_per_iteration": 4.940483808517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111091, + "balance_loss_mlp": 1.09893775, + "diversity_loss_mlp": 0.0, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.07826437205196314, + "language_loss": 0.82487583, + "learning_rate": 0.0006326665895567652, + "loss": 0.83598673, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2246, + "time_per_iteration": 2.6287152767181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111115, + "balance_loss_mlp": 1.09895015, + "diversity_loss_mlp": 0.0, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.09268036537549412, + "language_loss": 0.87613881, + "learning_rate": 0.0006323661881916976, + "loss": 0.88725001, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.121521, + "routerloss_mlp": 0.0, + "step": 2247, + "time_per_iteration": 2.6966464519500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_mlp": 1.08901072, + "diversity_loss_mlp": 0.0, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.07850654458656253, + "language_loss": 0.812437, + "learning_rate": 0.0006320657354375179, + "loss": 0.82344878, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2248, + "time_per_iteration": 3.0057384967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.08872366, + "diversity_loss_mlp": 0.0, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.07399569527983862, + "language_loss": 0.87203169, + "learning_rate": 0.0006317652314108726, + "loss": 0.88303995, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.12097168, + "routerloss_mlp": 0.0, + "step": 2249, + "time_per_iteration": 2.6106557846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083093, + "balance_loss_mlp": 1.07126176, + "diversity_loss_mlp": 0.0, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07131076511794647, + "language_loss": 0.91191232, + "learning_rate": 0.0006314646762284277, + "loss": 0.92274326, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2250, + "time_per_iteration": 2.601017951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_mlp": 1.02617049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.02997957544407836, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76458681, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2251, + "time_per_iteration": 4.872025966644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085431, + "balance_loss_mlp": 1.07351613, + "diversity_loss_mlp": 0.0, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07162967916255573, + "language_loss": 0.77412337, + "learning_rate": 0.0006308634128629022, + "loss": 0.78497767, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2252, + "time_per_iteration": 2.858896255493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089815, + "balance_loss_mlp": 1.07750654, + "diversity_loss_mlp": 0.0, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.0655401202696214, + "language_loss": 0.8742274, + "learning_rate": 0.0006305627049132531, + "loss": 0.88512552, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2253, + "time_per_iteration": 2.8089702129364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108552, + "balance_loss_mlp": 1.07309866, + "diversity_loss_mlp": 0.0, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.05577202062379855, + "language_loss": 0.85968709, + "learning_rate": 0.0006302619462746662, + "loss": 0.87054229, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2254, + "time_per_iteration": 3.117469072341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090126, + "balance_loss_mlp": 1.07842588, + "diversity_loss_mlp": 0.0, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.07095559842956704, + "language_loss": 0.90230805, + "learning_rate": 0.0006299611370639069, + "loss": 0.91320932, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2255, + "time_per_iteration": 2.723188638687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084284, + "balance_loss_mlp": 1.07239318, + "diversity_loss_mlp": 0.0, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07367301477096526, + "language_loss": 0.79524988, + "learning_rate": 0.0006296602773977593, + "loss": 0.80609274, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2256, + "time_per_iteration": 2.6743130683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099708, + "balance_loss_mlp": 1.08790588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06301035546935001, + "language_loss": 0.87406039, + "learning_rate": 0.0006293593673930277, + "loss": 0.88505745, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2257, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103769, + "balance_loss_mlp": 1.09211683, + "diversity_loss_mlp": 0.0, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07716264473653381, + "language_loss": 0.78774142, + "learning_rate": 0.0006290584071665358, + "loss": 0.79877913, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2258, + "time_per_iteration": 2.9148640632629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.07634544, + "diversity_loss_mlp": 0.0, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.06859255861010008, + "language_loss": 0.82309216, + "learning_rate": 0.0006287573968351266, + "loss": 0.83397484, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2259, + "time_per_iteration": 2.582099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081836, + "balance_loss_mlp": 1.06989694, + "diversity_loss_mlp": 0.0, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.0728512329620832, + "language_loss": 0.8210361, + "learning_rate": 0.0006284563365156626, + "loss": 0.83185446, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2260, + "time_per_iteration": 2.802004814147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_mlp": 1.06343079, + "diversity_loss_mlp": 0.0, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.08318375282180102, + "language_loss": 0.87862843, + "learning_rate": 0.0006281552263250261, + "loss": 0.88938093, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2261, + "time_per_iteration": 2.5335495471954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_mlp": 1.02721453, + "diversity_loss_mlp": 0.0, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02511862566194507, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81726044, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.07275391, + "routerloss_mlp": 0.0, + "step": 2262, + "time_per_iteration": 4.858395338058472 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05593562, + "diversity_loss_mlp": 0.0, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.07030760098393707, + "language_loss": 0.81181604, + "learning_rate": 0.0006275528567978593, + "loss": 0.82249182, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2263, + "time_per_iteration": 2.9562113285064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.05570674, + "diversity_loss_mlp": 0.0, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.09515047383985015, + "language_loss": 0.82464182, + "learning_rate": 0.0006272515976951898, + "loss": 0.83531702, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2264, + "time_per_iteration": 3.0750486850738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106778, + "balance_loss_mlp": 1.05625236, + "diversity_loss_mlp": 0.0, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.06538835415995116, + "language_loss": 0.7903443, + "learning_rate": 0.0006269502891890687, + "loss": 0.80102211, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2265, + "time_per_iteration": 3.0723042488098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069278, + "balance_loss_mlp": 1.05721438, + "diversity_loss_mlp": 0.0, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.06791130510000161, + "language_loss": 0.88071477, + "learning_rate": 0.0006266489313964743, + "loss": 0.89140749, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2266, + "time_per_iteration": 2.7362618446350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00937641, + "balance_loss_mlp": 1.63294578, + "diversity_loss_mlp": 0.21328503, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.028233172977391998, + "language_loss": 0.85207379, + "learning_rate": 0.0006263475244344041, + "loss": 0.8614502, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01452552, + "step": 2267, + "time_per_iteration": 2.8842954635620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082065, + "balance_loss_mlp": 1.06979251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.07502115173737808, + "language_loss": 0.84271002, + "learning_rate": 0.0006260460684198746, + "loss": 0.8535307, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2268, + "time_per_iteration": 2.6355533599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.07749879, + "diversity_loss_mlp": 0.0, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.07640014386484298, + "language_loss": 0.84040511, + "learning_rate": 0.0006257445634699213, + "loss": 0.85130346, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.12322998, + "routerloss_mlp": 0.0, + "step": 2269, + "time_per_iteration": 2.5279150009155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089112, + "balance_loss_mlp": 1.07683921, + "diversity_loss_mlp": 0.0, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.16142331523875347, + "language_loss": 0.83037758, + "learning_rate": 0.0006254430097015993, + "loss": 0.84126872, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.12268066, + "routerloss_mlp": 0.0, + "step": 2270, + "time_per_iteration": 2.660228729248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_mlp": 1.03087568, + "diversity_loss_mlp": 0.0, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.024589935077845904, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77516735, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2271, + "time_per_iteration": 4.794579744338989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070138, + "balance_loss_mlp": 1.05796623, + "diversity_loss_mlp": 0.0, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.057648382072647573, + "language_loss": 0.85053569, + "learning_rate": 0.0006248397561781609, + "loss": 0.86123705, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2272, + "time_per_iteration": 2.862569570541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067836, + "balance_loss_mlp": 1.05557537, + "diversity_loss_mlp": 0.0, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08840424380788836, + "language_loss": 0.86255217, + "learning_rate": 0.0006245380566572482, + "loss": 0.87323052, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2273, + "time_per_iteration": 2.7386484146118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.0566572, + "diversity_loss_mlp": 0.0, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07723857249852564, + "language_loss": 0.75794655, + "learning_rate": 0.0006242363087863744, + "loss": 0.76863599, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2274, + "time_per_iteration": 2.948030710220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.05560887, + "diversity_loss_mlp": 0.0, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.06687985923679116, + "language_loss": 0.86043644, + "learning_rate": 0.0006239345126826878, + "loss": 0.87111151, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2275, + "time_per_iteration": 2.787750482559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071289, + "balance_loss_mlp": 1.05926108, + "diversity_loss_mlp": 0.0, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07503499995760528, + "language_loss": 0.83946115, + "learning_rate": 0.0006236326684633561, + "loss": 0.85017407, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2276, + "time_per_iteration": 2.8109841346740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.05921769, + "diversity_loss_mlp": 0.0, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.08049471875944368, + "language_loss": 0.75253642, + "learning_rate": 0.0006233307762455658, + "loss": 0.76324785, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2277, + "time_per_iteration": 2.632291793823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_mlp": 1.06043518, + "diversity_loss_mlp": 0.0, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.0727539933311737, + "language_loss": 0.83312476, + "learning_rate": 0.0006230288361465216, + "loss": 0.8438465, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2278, + "time_per_iteration": 3.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106943, + "balance_loss_mlp": 1.05752659, + "diversity_loss_mlp": 0.0, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.08745359184854619, + "language_loss": 0.84888816, + "learning_rate": 0.0006227268482834473, + "loss": 0.85958248, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.11889648, + "routerloss_mlp": 0.0, + "step": 2279, + "time_per_iteration": 2.9116861820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00929134, + "balance_loss_mlp": 1.61467147, + "diversity_loss_mlp": 0.21327347, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.03053717197724305, + "language_loss": 0.8733198, + "learning_rate": 0.000622424812773585, + "loss": 0.88261116, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0151619, + "step": 2280, + "time_per_iteration": 2.83655047416687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087033, + "balance_loss_mlp": 1.07515955, + "diversity_loss_mlp": 0.0, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.09030781332224262, + "language_loss": 0.8003484, + "learning_rate": 0.000622122729734195, + "loss": 0.81121874, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2281, + "time_per_iteration": 2.598515033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088881, + "balance_loss_mlp": 1.07746708, + "diversity_loss_mlp": 0.0, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.05965815533468205, + "language_loss": 0.87430406, + "learning_rate": 0.0006218205992825566, + "loss": 0.88519287, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2282, + "time_per_iteration": 2.6424663066864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084394, + "balance_loss_mlp": 1.07271123, + "diversity_loss_mlp": 0.0, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.06483845116972914, + "language_loss": 0.81733787, + "learning_rate": 0.0006215184215359671, + "loss": 0.8281818, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2283, + "time_per_iteration": 2.736311674118042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087917, + "balance_loss_mlp": 1.07662153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.0656289826640407, + "language_loss": 0.86697561, + "learning_rate": 0.0006212161966117425, + "loss": 0.8778547, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2284, + "time_per_iteration": 2.727402448654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091514, + "balance_loss_mlp": 1.07989156, + "diversity_loss_mlp": 0.0, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.07463232969806483, + "language_loss": 0.81628394, + "learning_rate": 0.0006209139246272164, + "loss": 0.8271991, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2285, + "time_per_iteration": 2.978759527206421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.08205843, + "diversity_loss_mlp": 0.0, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.08236326374350296, + "language_loss": 0.81938732, + "learning_rate": 0.0006206116056997421, + "loss": 0.83032608, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2286, + "time_per_iteration": 2.6111207008361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085811, + "balance_loss_mlp": 1.07444477, + "diversity_loss_mlp": 0.0, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.06662472973472185, + "language_loss": 0.82727671, + "learning_rate": 0.0006203092399466892, + "loss": 0.83813483, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2287, + "time_per_iteration": 2.6246864795684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109279, + "balance_loss_mlp": 1.08137023, + "diversity_loss_mlp": 0.0, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.06470350083987941, + "language_loss": 0.85380936, + "learning_rate": 0.0006200068274854473, + "loss": 0.86473733, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.11419678, + "routerloss_mlp": 0.0, + "step": 2288, + "time_per_iteration": 2.675197124481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091416, + "balance_loss_mlp": 1.07988858, + "diversity_loss_mlp": 0.0, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.0650031810595099, + "language_loss": 0.8588661, + "learning_rate": 0.0006197043684334229, + "loss": 0.86978024, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2289, + "time_per_iteration": 2.787095785140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_mlp": 1.08063841, + "diversity_loss_mlp": 0.0, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.0715970788084748, + "language_loss": 0.79333103, + "learning_rate": 0.0006194018629080411, + "loss": 0.80425215, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2290, + "time_per_iteration": 2.817836284637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.09150028, + "diversity_loss_mlp": 0.0, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.07061114258803743, + "language_loss": 0.81714827, + "learning_rate": 0.0006190993110267451, + "loss": 0.82817852, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.11523438, + "routerloss_mlp": 0.0, + "step": 2291, + "time_per_iteration": 2.741288900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108227, + "balance_loss_mlp": 1.09614503, + "diversity_loss_mlp": 0.0, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.07455801894128893, + "language_loss": 0.84193838, + "learning_rate": 0.0006187967129069958, + "loss": 0.85302061, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.12084961, + "routerloss_mlp": 0.0, + "step": 2292, + "time_per_iteration": 2.5778286457061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106682, + "balance_loss_mlp": 1.09472573, + "diversity_loss_mlp": 0.0, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.06400814904414545, + "language_loss": 0.8690064, + "learning_rate": 0.0006184940686662722, + "loss": 0.88007319, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2293, + "time_per_iteration": 2.7292487621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111022, + "balance_loss_mlp": 1.09812045, + "diversity_loss_mlp": 0.0, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.06813451942076464, + "language_loss": 0.90379488, + "learning_rate": 0.0006181913784220714, + "loss": 0.91489702, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2294, + "time_per_iteration": 2.6506428718566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081962, + "balance_loss_mlp": 1.0750953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.029819366941177792, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81635749, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2295, + "time_per_iteration": 4.882002592086792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110182, + "balance_loss_mlp": 1.09772444, + "diversity_loss_mlp": 0.0, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07012194180041048, + "language_loss": 0.7971437, + "learning_rate": 0.0006175858603933146, + "loss": 0.80824548, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.12469482, + "routerloss_mlp": 0.0, + "step": 2296, + "time_per_iteration": 2.8836371898651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908854, + "balance_loss_mlp": 1.58032632, + "diversity_loss_mlp": 0.2095283, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.03267646081870075, + "language_loss": 0.80986243, + "learning_rate": 0.0006172830328438416, + "loss": 0.81895095, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01392685, + "step": 2297, + "time_per_iteration": 2.9758472442626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093338, + "balance_loss_mlp": 1.0806725, + "diversity_loss_mlp": 0.0, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.0684627092891604, + "language_loss": 0.86739677, + "learning_rate": 0.0006169801597610572, + "loss": 0.87833017, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.12670898, + "routerloss_mlp": 0.0, + "step": 2298, + "time_per_iteration": 2.796999454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080861, + "balance_loss_mlp": 1.06855834, + "diversity_loss_mlp": 0.0, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09148837874044675, + "language_loss": 0.89672303, + "learning_rate": 0.0006166772412625469, + "loss": 0.90753162, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.12304688, + "routerloss_mlp": 0.0, + "step": 2299, + "time_per_iteration": 2.719217300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079493, + "balance_loss_mlp": 1.06674969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.0806717243265584, + "language_loss": 0.81995088, + "learning_rate": 0.0006163742774659141, + "loss": 0.83074582, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.12744141, + "routerloss_mlp": 0.0, + "step": 2300, + "time_per_iteration": 2.857851266860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082582, + "balance_loss_mlp": 1.07051837, + "diversity_loss_mlp": 0.0, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07368324051857801, + "language_loss": 0.85920924, + "learning_rate": 0.0006160712684887801, + "loss": 0.87003505, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2301, + "time_per_iteration": 2.7615816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076648, + "balance_loss_mlp": 1.06491232, + "diversity_loss_mlp": 0.0, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.07775198871362894, + "language_loss": 0.81987381, + "learning_rate": 0.0006157682144487832, + "loss": 0.83064032, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2302, + "time_per_iteration": 2.759446620941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071769, + "balance_loss_mlp": 1.05998516, + "diversity_loss_mlp": 0.0, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.07391427816126875, + "language_loss": 0.82887244, + "learning_rate": 0.0006154651154635793, + "loss": 0.83959019, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2303, + "time_per_iteration": 2.8566582202911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074582, + "balance_loss_mlp": 1.0627867, + "diversity_loss_mlp": 0.0, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07276664214775759, + "language_loss": 0.84800553, + "learning_rate": 0.0006151619716508421, + "loss": 0.85875136, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2304, + "time_per_iteration": 2.678624153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070842, + "balance_loss_mlp": 1.05890322, + "diversity_loss_mlp": 0.0, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.0708190445963316, + "language_loss": 0.87117589, + "learning_rate": 0.0006148587831282625, + "loss": 0.88188434, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2305, + "time_per_iteration": 2.6833643913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05813479, + "diversity_loss_mlp": 0.0, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.03167846404368131, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80241072, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.06884766, + "routerloss_mlp": 0.0, + "step": 2306, + "time_per_iteration": 4.908214092254639 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_mlp": 1.06202734, + "diversity_loss_mlp": 0.0, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.10781991147306623, + "language_loss": 0.87386847, + "learning_rate": 0.0006142522724244255, + "loss": 0.8846153, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.12664795, + "routerloss_mlp": 0.0, + "step": 2307, + "time_per_iteration": 2.559011459350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_mlp": 1.03301477, + "diversity_loss_mlp": 0.0, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.019467834986953515, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77524698, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.06982422, + "routerloss_mlp": 0.0, + "step": 2308, + "time_per_iteration": 4.990226984024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010956, + "balance_loss_mlp": 1.08379281, + "diversity_loss_mlp": 0.0, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.134173965781989, + "language_loss": 0.77330542, + "learning_rate": 0.000613645584293942, + "loss": 0.78426147, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2309, + "time_per_iteration": 2.925625801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096392, + "balance_loss_mlp": 1.08444726, + "diversity_loss_mlp": 0.0, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.07260585347328512, + "language_loss": 0.83497787, + "learning_rate": 0.0006133421739881185, + "loss": 0.84594172, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.11938477, + "routerloss_mlp": 0.0, + "step": 2310, + "time_per_iteration": 2.6521387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105727, + "balance_loss_mlp": 1.09360933, + "diversity_loss_mlp": 0.0, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08716252058009813, + "language_loss": 0.82747865, + "learning_rate": 0.0006130387196789605, + "loss": 0.8385359, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.12115479, + "routerloss_mlp": 0.0, + "step": 2311, + "time_per_iteration": 2.7266759872436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100575, + "balance_loss_mlp": 1.08809423, + "diversity_loss_mlp": 0.0, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.057672451626414926, + "language_loss": 0.84308195, + "learning_rate": 0.0006127352214842795, + "loss": 0.85408771, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2312, + "time_per_iteration": 2.9728119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104798, + "balance_loss_mlp": 1.09263897, + "diversity_loss_mlp": 0.0, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.09124128780751645, + "language_loss": 0.85551131, + "learning_rate": 0.0006124316795219041, + "loss": 0.86655927, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2313, + "time_per_iteration": 2.793999671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098642, + "balance_loss_mlp": 1.08649504, + "diversity_loss_mlp": 0.0, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.07392199689713573, + "language_loss": 0.82170153, + "learning_rate": 0.0006121280939096794, + "loss": 0.83268797, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.12145996, + "routerloss_mlp": 0.0, + "step": 2314, + "time_per_iteration": 2.7882213592529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087686, + "balance_loss_mlp": 1.07496047, + "diversity_loss_mlp": 0.0, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.07188819518398708, + "language_loss": 0.87831259, + "learning_rate": 0.000611824464765468, + "loss": 0.88918942, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.12738037, + "routerloss_mlp": 0.0, + "step": 2315, + "time_per_iteration": 2.570239305496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_mlp": 1.03435254, + "diversity_loss_mlp": 0.0, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.031544046963938845, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79636735, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.07421875, + "routerloss_mlp": 0.0, + "step": 2316, + "time_per_iteration": 4.63933539390564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107211, + "balance_loss_mlp": 1.05995071, + "diversity_loss_mlp": 0.0, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.10006595419905694, + "language_loss": 0.85561663, + "learning_rate": 0.000611217076352619, + "loss": 0.86633772, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2317, + "time_per_iteration": 2.763282299041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068374, + "balance_loss_mlp": 1.05613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.07080250397958886, + "language_loss": 0.8323034, + "learning_rate": 0.0006109133173197905, + "loss": 0.84298718, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2318, + "time_per_iteration": 2.7228074073791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.0546751, + "diversity_loss_mlp": 0.0, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07919775459104113, + "language_loss": 0.85392821, + "learning_rate": 0.0006106095152265935, + "loss": 0.86459887, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.12390137, + "routerloss_mlp": 0.0, + "step": 2319, + "time_per_iteration": 2.950333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067661, + "balance_loss_mlp": 1.05547166, + "diversity_loss_mlp": 0.0, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.061336847968553085, + "language_loss": 0.84789562, + "learning_rate": 0.0006103056701909739, + "loss": 0.85857224, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2320, + "time_per_iteration": 2.9283788204193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076472, + "balance_loss_mlp": 1.06437278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.06696737396207848, + "language_loss": 0.83276129, + "learning_rate": 0.0006100017823308956, + "loss": 0.84352595, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2321, + "time_per_iteration": 3.159337282180786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072799, + "balance_loss_mlp": 1.06091988, + "diversity_loss_mlp": 0.0, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.07676377008356373, + "language_loss": 0.79803503, + "learning_rate": 0.0006096978517643377, + "loss": 0.80876303, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2322, + "time_per_iteration": 2.8253674507141113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00921995, + "balance_loss_mlp": 1.60181236, + "diversity_loss_mlp": 0.21422489, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.03237790796068106, + "language_loss": 0.83347481, + "learning_rate": 0.0006093938786092968, + "loss": 0.84269476, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01397606, + "step": 2323, + "time_per_iteration": 2.648444890975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110106, + "balance_loss_mlp": 1.09840608, + "diversity_loss_mlp": 0.0, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.07300553293113453, + "language_loss": 0.90023661, + "learning_rate": 0.0006090898629837857, + "loss": 0.91133773, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2324, + "time_per_iteration": 2.852698564529419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126468, + "balance_loss_mlp": 1.11461282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.06000654076761871, + "language_loss": 0.87143672, + "learning_rate": 0.0006087858050058337, + "loss": 0.8827014, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2325, + "time_per_iteration": 2.7674834728240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138358, + "balance_loss_mlp": 1.12663388, + "diversity_loss_mlp": 0.0, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.0853990663964482, + "language_loss": 0.82412744, + "learning_rate": 0.0006084817047934866, + "loss": 0.83551097, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.1171875, + "routerloss_mlp": 0.0, + "step": 2326, + "time_per_iteration": 2.6421871185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121575, + "balance_loss_mlp": 1.10977352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08985792381424736, + "language_loss": 0.89330196, + "learning_rate": 0.0006081775624648066, + "loss": 0.90451771, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.11791992, + "routerloss_mlp": 0.0, + "step": 2327, + "time_per_iteration": 2.578197956085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131674, + "balance_loss_mlp": 1.12057006, + "diversity_loss_mlp": 0.0, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.0872530433154025, + "language_loss": 0.83162999, + "learning_rate": 0.0006078733781378721, + "loss": 0.84294665, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2328, + "time_per_iteration": 2.6186208724975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099348, + "balance_loss_mlp": 1.08810675, + "diversity_loss_mlp": 0.0, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07633837573658239, + "language_loss": 0.82202363, + "learning_rate": 0.0006075691519307781, + "loss": 0.83301711, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2329, + "time_per_iteration": 2.9000244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094166, + "balance_loss_mlp": 1.08247721, + "diversity_loss_mlp": 0.0, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0736281868256213, + "language_loss": 0.81618124, + "learning_rate": 0.0006072648839616356, + "loss": 0.82712287, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2330, + "time_per_iteration": 2.6364829540252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083826, + "balance_loss_mlp": 1.07230425, + "diversity_loss_mlp": 0.0, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.0657010816534965, + "language_loss": 0.82723016, + "learning_rate": 0.0006069605743485718, + "loss": 0.83806837, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2331, + "time_per_iteration": 3.3334474563598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086805, + "balance_loss_mlp": 1.07531917, + "diversity_loss_mlp": 0.0, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07225675858451452, + "language_loss": 0.83265316, + "learning_rate": 0.0006066562232097303, + "loss": 0.84352124, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2332, + "time_per_iteration": 2.705143690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.07051468, + "diversity_loss_mlp": 0.0, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.06521315479324259, + "language_loss": 0.8614397, + "learning_rate": 0.0006063518306632708, + "loss": 0.87226027, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2333, + "time_per_iteration": 2.9501705169677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085822, + "balance_loss_mlp": 1.07427073, + "diversity_loss_mlp": 0.0, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.07251688845149425, + "language_loss": 0.82197714, + "learning_rate": 0.0006060473968273688, + "loss": 0.83283544, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2334, + "time_per_iteration": 2.708394765853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_mlp": 1.032179, + "diversity_loss_mlp": 0.0, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.02865006957504222, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78918916, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.07177734, + "routerloss_mlp": 0.0, + "step": 2335, + "time_per_iteration": 4.866912841796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_mlp": 1.01901519, + "diversity_loss_mlp": 0.0, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.021847156852776353, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82031286, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.07080078, + "routerloss_mlp": 0.0, + "step": 2336, + "time_per_iteration": 4.834076642990112 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108818, + "balance_loss_mlp": 1.07613969, + "diversity_loss_mlp": 0.0, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.09890748330953583, + "language_loss": 0.88285863, + "learning_rate": 0.0006051338487650047, + "loss": 0.89374042, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.12042236, + "routerloss_mlp": 0.0, + "step": 2337, + "time_per_iteration": 2.4428114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00930205, + "balance_loss_mlp": 1.62015963, + "diversity_loss_mlp": 0.20974493, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.03186253719782368, + "language_loss": 0.82399797, + "learning_rate": 0.0006048292509534095, + "loss": 0.83329999, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01525321, + "step": 2338, + "time_per_iteration": 2.6332457065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.06772542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.08456945041025239, + "language_loss": 0.77873439, + "learning_rate": 0.0006045246124434895, + "loss": 0.7895329, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.12127686, + "routerloss_mlp": 0.0, + "step": 2339, + "time_per_iteration": 2.7590980529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073061, + "balance_loss_mlp": 1.06156278, + "diversity_loss_mlp": 0.0, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.06841757056071682, + "language_loss": 0.86623305, + "learning_rate": 0.0006042199333535162, + "loss": 0.87696362, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2340, + "time_per_iteration": 3.293574333190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079106, + "balance_loss_mlp": 1.06769133, + "diversity_loss_mlp": 0.0, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06101547553515947, + "language_loss": 0.84343052, + "learning_rate": 0.0006039152138017763, + "loss": 0.85422158, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2341, + "time_per_iteration": 3.0700981616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087051, + "balance_loss_mlp": 1.07579744, + "diversity_loss_mlp": 0.0, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.09071323966594208, + "language_loss": 0.83541143, + "learning_rate": 0.0006036104539065726, + "loss": 0.84628195, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.11260986, + "routerloss_mlp": 0.0, + "step": 2342, + "time_per_iteration": 2.6694719791412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089407, + "balance_loss_mlp": 1.07793319, + "diversity_loss_mlp": 0.0, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.08270437502254605, + "language_loss": 0.84371507, + "learning_rate": 0.000603305653786223, + "loss": 0.85460913, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2343, + "time_per_iteration": 3.16105318069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083424, + "balance_loss_mlp": 1.07187295, + "diversity_loss_mlp": 0.0, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.07028076371432387, + "language_loss": 0.84103405, + "learning_rate": 0.0006030008135590622, + "loss": 0.85186827, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2344, + "time_per_iteration": 2.7197835445404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082164, + "balance_loss_mlp": 1.07096398, + "diversity_loss_mlp": 0.0, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.05864949769745669, + "language_loss": 0.7999413, + "learning_rate": 0.0006026959333434387, + "loss": 0.81076288, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2345, + "time_per_iteration": 2.777010202407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00919083, + "balance_loss_mlp": 1.6008426, + "diversity_loss_mlp": 0.20793086, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.028469676504860836, + "language_loss": 0.77684712, + "learning_rate": 0.0006023910132577181, + "loss": 0.78603798, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01469593, + "step": 2346, + "time_per_iteration": 2.689173936843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093318, + "balance_loss_mlp": 1.08186746, + "diversity_loss_mlp": 0.0, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.07173117007756048, + "language_loss": 0.84956741, + "learning_rate": 0.0006020860534202806, + "loss": 0.86050057, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2347, + "time_per_iteration": 2.499941110610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099048, + "balance_loss_mlp": 1.08747303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.06525031943024168, + "language_loss": 0.81076705, + "learning_rate": 0.0006017810539495224, + "loss": 0.82175756, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2348, + "time_per_iteration": 2.9487318992614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094541, + "balance_loss_mlp": 1.08284068, + "diversity_loss_mlp": 0.0, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07881291561071736, + "language_loss": 0.82607108, + "learning_rate": 0.0006014760149638547, + "loss": 0.83701646, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.11700439, + "routerloss_mlp": 0.0, + "step": 2349, + "time_per_iteration": 2.7228691577911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096509, + "balance_loss_mlp": 1.0852139, + "diversity_loss_mlp": 0.0, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.08019466042103662, + "language_loss": 0.88398969, + "learning_rate": 0.000601170936581704, + "loss": 0.8949548, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2350, + "time_per_iteration": 2.521714687347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090727, + "balance_loss_mlp": 1.07951522, + "diversity_loss_mlp": 0.0, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.08533615412567333, + "language_loss": 0.84897137, + "learning_rate": 0.0006008658189215121, + "loss": 0.85987866, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2351, + "time_per_iteration": 2.6506216526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07545722, + "diversity_loss_mlp": 0.0, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.09237808795246917, + "language_loss": 0.80232167, + "learning_rate": 0.0006005606621017366, + "loss": 0.81319243, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2352, + "time_per_iteration": 2.5878968238830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010807, + "balance_loss_mlp": 1.06907678, + "diversity_loss_mlp": 0.0, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.07057821380790058, + "language_loss": 0.80339801, + "learning_rate": 0.0006002554662408496, + "loss": 0.81420493, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1161499, + "routerloss_mlp": 0.0, + "step": 2353, + "time_per_iteration": 2.883782386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.0691061, + "diversity_loss_mlp": 0.0, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.0736680584084088, + "language_loss": 0.9135446, + "learning_rate": 0.0005999502314573388, + "loss": 0.9243511, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2354, + "time_per_iteration": 2.645484685897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_mlp": 1.09201527, + "diversity_loss_mlp": 0.0, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07036557956994945, + "language_loss": 0.86196381, + "learning_rate": 0.0005996449578697066, + "loss": 0.87299991, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.11602783, + "routerloss_mlp": 0.0, + "step": 2355, + "time_per_iteration": 2.648574113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906536, + "balance_loss_mlp": 1.57839537, + "diversity_loss_mlp": 0.20635399, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.031145483684461562, + "language_loss": 0.81619978, + "learning_rate": 0.0005993396455964709, + "loss": 0.82526517, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01416124, + "step": 2356, + "time_per_iteration": 2.7277767658233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115937, + "balance_loss_mlp": 1.14805746, + "diversity_loss_mlp": 0.0, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07904312092760724, + "language_loss": 0.81657517, + "learning_rate": 0.0005990342947561647, + "loss": 0.82816887, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2357, + "time_per_iteration": 2.696223258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167894, + "balance_loss_mlp": 1.15651524, + "diversity_loss_mlp": 0.0, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.07381995676601517, + "language_loss": 0.78198934, + "learning_rate": 0.0005987289054673351, + "loss": 0.79366827, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2358, + "time_per_iteration": 2.602642059326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01360078, + "balance_loss_mlp": 1.35392714, + "diversity_loss_mlp": 0.0, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.12195170998658643, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77935815, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2359, + "time_per_iteration": 4.880090713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146892, + "balance_loss_mlp": 1.13553107, + "diversity_loss_mlp": 0.0, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07250720881476776, + "language_loss": 0.91548061, + "learning_rate": 0.0005981180120183722, + "loss": 0.9269495, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2360, + "time_per_iteration": 2.680730104446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133243, + "balance_loss_mlp": 1.121382, + "diversity_loss_mlp": 0.0, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.055968167495159496, + "language_loss": 0.85338825, + "learning_rate": 0.0005978125080954089, + "loss": 0.8647207, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.11853027, + "routerloss_mlp": 0.0, + "step": 2361, + "time_per_iteration": 2.791376829147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124184, + "balance_loss_mlp": 1.11265099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.08653591933533131, + "language_loss": 0.77322888, + "learning_rate": 0.000597506966198262, + "loss": 0.7844708, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2362, + "time_per_iteration": 2.97446870803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119088, + "balance_loss_mlp": 1.10733426, + "diversity_loss_mlp": 0.0, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.09240364374598002, + "language_loss": 0.84247041, + "learning_rate": 0.0005972013864455536, + "loss": 0.85366124, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2363, + "time_per_iteration": 2.577167510986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108786, + "balance_loss_mlp": 1.09771168, + "diversity_loss_mlp": 0.0, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.0787330127694287, + "language_loss": 0.8535012, + "learning_rate": 0.0005968957689559203, + "loss": 0.8645891, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2364, + "time_per_iteration": 2.7120981216430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105615, + "balance_loss_mlp": 1.09457588, + "diversity_loss_mlp": 0.0, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07389843074969835, + "language_loss": 0.88484383, + "learning_rate": 0.0005965901138480131, + "loss": 0.89590001, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2365, + "time_per_iteration": 2.578874349594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110202, + "balance_loss_mlp": 1.09081471, + "diversity_loss_mlp": 0.0, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.06426783448513047, + "language_loss": 0.87068385, + "learning_rate": 0.0005962844212404982, + "loss": 0.88170409, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.11206055, + "routerloss_mlp": 0.0, + "step": 2366, + "time_per_iteration": 2.6638920307159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096105, + "balance_loss_mlp": 1.08472049, + "diversity_loss_mlp": 0.0, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.05830156527831164, + "language_loss": 0.87147355, + "learning_rate": 0.0005959786912520558, + "loss": 0.88243461, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2367, + "time_per_iteration": 2.6142454147338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088371, + "balance_loss_mlp": 1.07726681, + "diversity_loss_mlp": 0.0, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.06261196085687584, + "language_loss": 0.83712542, + "learning_rate": 0.0005956729240013806, + "loss": 0.84800917, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2368, + "time_per_iteration": 2.786256790161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095858, + "balance_loss_mlp": 1.08447385, + "diversity_loss_mlp": 0.0, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.06874460659515655, + "language_loss": 0.91648531, + "learning_rate": 0.0005953671196071824, + "loss": 0.92744386, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2369, + "time_per_iteration": 2.756943941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093695, + "balance_loss_mlp": 1.08220375, + "diversity_loss_mlp": 0.0, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.07258619671695062, + "language_loss": 0.80044961, + "learning_rate": 0.0005950612781881846, + "loss": 0.81138659, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2370, + "time_per_iteration": 2.6791019439697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00906758, + "balance_loss_mlp": 1.57760763, + "diversity_loss_mlp": 0.20680004, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.03266097765038979, + "language_loss": 0.76005763, + "learning_rate": 0.0005947553998631259, + "loss": 0.76912522, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01455403, + "step": 2371, + "time_per_iteration": 2.908493995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010769, + "balance_loss_mlp": 1.06543183, + "diversity_loss_mlp": 0.0, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.05564189265933484, + "language_loss": 0.79205543, + "learning_rate": 0.000594449484750758, + "loss": 0.80282438, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2372, + "time_per_iteration": 3.18151593208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072251, + "balance_loss_mlp": 1.06046152, + "diversity_loss_mlp": 0.0, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07444834598910231, + "language_loss": 0.83208215, + "learning_rate": 0.0005941435329698484, + "loss": 0.84280467, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2373, + "time_per_iteration": 2.6709630489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107048, + "balance_loss_mlp": 1.05895281, + "diversity_loss_mlp": 0.0, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.06837725942446468, + "language_loss": 0.83204812, + "learning_rate": 0.0005938375446391778, + "loss": 0.84275293, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.11529541, + "routerloss_mlp": 0.0, + "step": 2374, + "time_per_iteration": 2.6943106651306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.06261396, + "diversity_loss_mlp": 0.0, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.0748623734907781, + "language_loss": 0.8912878, + "learning_rate": 0.0005935315198775415, + "loss": 0.90203297, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2375, + "time_per_iteration": 2.6303911209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.05491209, + "diversity_loss_mlp": 0.0, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.06590971106227904, + "language_loss": 0.87093645, + "learning_rate": 0.0005932254588037486, + "loss": 0.88160467, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2376, + "time_per_iteration": 2.5003554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106434, + "balance_loss_mlp": 1.0520016, + "diversity_loss_mlp": 0.0, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07188519107297629, + "language_loss": 0.86239958, + "learning_rate": 0.000592919361536623, + "loss": 0.87304294, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2377, + "time_per_iteration": 2.6426758766174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106134, + "balance_loss_mlp": 1.04946113, + "diversity_loss_mlp": 0.0, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.06083573176815847, + "language_loss": 0.88679874, + "learning_rate": 0.0005926132281950017, + "loss": 0.89741206, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.11871338, + "routerloss_mlp": 0.0, + "step": 2378, + "time_per_iteration": 2.7510690689086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065869, + "balance_loss_mlp": 1.05310154, + "diversity_loss_mlp": 0.0, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07940360452878177, + "language_loss": 0.85365742, + "learning_rate": 0.0005923070588977367, + "loss": 0.86431611, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.12774658, + "routerloss_mlp": 0.0, + "step": 2379, + "time_per_iteration": 2.7969985008239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066405, + "balance_loss_mlp": 1.05444837, + "diversity_loss_mlp": 0.0, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.06398281947580985, + "language_loss": 0.86384034, + "learning_rate": 0.0005920008537636931, + "loss": 0.87450439, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2380, + "time_per_iteration": 2.90964412689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.05391335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.05698304417859526, + "language_loss": 0.86739266, + "learning_rate": 0.0005916946129117504, + "loss": 0.87805718, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.12548828, + "routerloss_mlp": 0.0, + "step": 2381, + "time_per_iteration": 2.9013612270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074542, + "balance_loss_mlp": 1.06223381, + "diversity_loss_mlp": 0.0, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07634094682432664, + "language_loss": 0.80304879, + "learning_rate": 0.0005913883364608017, + "loss": 0.81379426, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.12298584, + "routerloss_mlp": 0.0, + "step": 2382, + "time_per_iteration": 3.086503505706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_mlp": 1.07212973, + "diversity_loss_mlp": 0.0, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.06243795661807547, + "language_loss": 0.8841778, + "learning_rate": 0.0005910820245297542, + "loss": 0.89501894, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.11975098, + "routerloss_mlp": 0.0, + "step": 2383, + "time_per_iteration": 2.8612842559814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090258, + "balance_loss_mlp": 1.07756186, + "diversity_loss_mlp": 0.0, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.08243832238560393, + "language_loss": 0.80972016, + "learning_rate": 0.000590775677237529, + "loss": 0.82062268, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.12695312, + "routerloss_mlp": 0.0, + "step": 2384, + "time_per_iteration": 2.731405735015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094631, + "balance_loss_mlp": 1.08257282, + "diversity_loss_mlp": 0.0, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.07578687885193977, + "language_loss": 0.80532229, + "learning_rate": 0.0005904692947030601, + "loss": 0.81626856, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.1204834, + "routerloss_mlp": 0.0, + "step": 2385, + "time_per_iteration": 2.6176209449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106556, + "balance_loss_mlp": 1.09437895, + "diversity_loss_mlp": 0.0, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.08078833732724985, + "language_loss": 0.8953619, + "learning_rate": 0.0005901628770452963, + "loss": 0.90642744, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.1217041, + "routerloss_mlp": 0.0, + "step": 2386, + "time_per_iteration": 2.5513737201690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.10345697, + "diversity_loss_mlp": 0.0, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.09403156888929357, + "language_loss": 0.87502134, + "learning_rate": 0.000589856424383199, + "loss": 0.88617843, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.12255859, + "routerloss_mlp": 0.0, + "step": 2387, + "time_per_iteration": 2.599862813949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111298, + "balance_loss_mlp": 1.10114813, + "diversity_loss_mlp": 0.0, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.08117329221401763, + "language_loss": 0.8309918, + "learning_rate": 0.000589549936835744, + "loss": 0.8421216, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2388, + "time_per_iteration": 2.914754867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101869, + "balance_loss_mlp": 1.0899775, + "diversity_loss_mlp": 0.0, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06559429512714879, + "language_loss": 0.79056096, + "learning_rate": 0.0005892434145219202, + "loss": 0.80157959, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2389, + "time_per_iteration": 2.6295268535614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00898813, + "balance_loss_mlp": 1.5620172, + "diversity_loss_mlp": 0.2081904, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.0365067866217014, + "language_loss": 0.82780147, + "learning_rate": 0.0005889368575607303, + "loss": 0.83678961, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01370906, + "step": 2390, + "time_per_iteration": 2.8635401725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089349, + "balance_loss_mlp": 1.07753515, + "diversity_loss_mlp": 0.0, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.056196182118315396, + "language_loss": 0.78421402, + "learning_rate": 0.00058863026607119, + "loss": 0.79510748, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.11816406, + "routerloss_mlp": 0.0, + "step": 2391, + "time_per_iteration": 3.0734708309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099092, + "balance_loss_mlp": 1.08715332, + "diversity_loss_mlp": 0.0, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.07079174515079527, + "language_loss": 0.795928, + "learning_rate": 0.0005883236401723287, + "loss": 0.80691886, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.11932373, + "routerloss_mlp": 0.0, + "step": 2392, + "time_per_iteration": 3.1697676181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.08348131, + "diversity_loss_mlp": 0.0, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08882239564338372, + "language_loss": 0.84418833, + "learning_rate": 0.0005880169799831893, + "loss": 0.85514069, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2393, + "time_per_iteration": 2.668509006500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095056, + "balance_loss_mlp": 1.08327174, + "diversity_loss_mlp": 0.0, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.06874062850812142, + "language_loss": 0.81593782, + "learning_rate": 0.0005877102856228278, + "loss": 0.82688844, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2394, + "time_per_iteration": 2.862039566040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099301, + "balance_loss_mlp": 1.08791018, + "diversity_loss_mlp": 0.0, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07005170830273995, + "language_loss": 0.84822053, + "learning_rate": 0.0005874035572103133, + "loss": 0.85921353, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2395, + "time_per_iteration": 2.660466194152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092906, + "balance_loss_mlp": 1.08152771, + "diversity_loss_mlp": 0.0, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09691208121118819, + "language_loss": 0.82382149, + "learning_rate": 0.0005870967948647288, + "loss": 0.83475053, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2396, + "time_per_iteration": 2.8379006385803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259876, + "balance_loss_mlp": 1.25238955, + "diversity_loss_mlp": 0.0, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.08205623370138872, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75568175, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.07470703, + "routerloss_mlp": 0.0, + "step": 2397, + "time_per_iteration": 5.0380027294158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00912357, + "balance_loss_mlp": 1.5885272, + "diversity_loss_mlp": 0.20776251, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.030510515868204604, + "language_loss": 0.86040902, + "learning_rate": 0.0005864831688507443, + "loss": 0.86953259, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0142122, + "step": 2398, + "time_per_iteration": 2.9795196056365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099565, + "balance_loss_mlp": 1.08854449, + "diversity_loss_mlp": 0.0, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.07495608045078013, + "language_loss": 0.75224954, + "learning_rate": 0.0005861763054205754, + "loss": 0.76324517, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2399, + "time_per_iteration": 2.7307660579681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00908198, + "balance_loss_mlp": 1.58042729, + "diversity_loss_mlp": 0.20863593, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.03052990379504839, + "language_loss": 0.8056978, + "learning_rate": 0.0005858694085337976, + "loss": 0.81477976, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01366598, + "step": 2400, + "time_per_iteration": 2.8421711921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115275, + "balance_loss_mlp": 1.10424817, + "diversity_loss_mlp": 0.0, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08470381171074581, + "language_loss": 0.8355788, + "learning_rate": 0.0005855624783095589, + "loss": 0.84673154, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2401, + "time_per_iteration": 2.554006814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114727, + "balance_loss_mlp": 1.10386109, + "diversity_loss_mlp": 0.0, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.07139821582333657, + "language_loss": 0.85265267, + "learning_rate": 0.00058525551486702, + "loss": 0.86379993, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2402, + "time_per_iteration": 2.5159239768981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119193, + "balance_loss_mlp": 1.10795164, + "diversity_loss_mlp": 0.0, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.08747389081307531, + "language_loss": 0.80850065, + "learning_rate": 0.0005849485183253548, + "loss": 0.81969261, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2403, + "time_per_iteration": 2.643031358718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110943, + "balance_loss_mlp": 1.09971905, + "diversity_loss_mlp": 0.0, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06974006499463392, + "language_loss": 0.8764264, + "learning_rate": 0.0005846414888037501, + "loss": 0.88753581, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2404, + "time_per_iteration": 2.4847412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091069, + "balance_loss_mlp": 1.07962489, + "diversity_loss_mlp": 0.0, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.07303422211334305, + "language_loss": 0.82384312, + "learning_rate": 0.0005843344264214049, + "loss": 0.83475375, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.11444092, + "routerloss_mlp": 0.0, + "step": 2405, + "time_per_iteration": 2.7470028400421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093931, + "balance_loss_mlp": 1.08265948, + "diversity_loss_mlp": 0.0, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.06660378994806349, + "language_loss": 0.84838545, + "learning_rate": 0.0005840273312975317, + "loss": 0.85932475, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2406, + "time_per_iteration": 2.834179162979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082019, + "balance_loss_mlp": 1.07018733, + "diversity_loss_mlp": 0.0, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.07201348711751891, + "language_loss": 0.89853442, + "learning_rate": 0.0005837202035513555, + "loss": 0.90935457, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2407, + "time_per_iteration": 2.578505277633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081302, + "balance_loss_mlp": 1.06933987, + "diversity_loss_mlp": 0.0, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.06479654524201506, + "language_loss": 0.81299376, + "learning_rate": 0.0005834130433021136, + "loss": 0.82380676, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.11956787, + "routerloss_mlp": 0.0, + "step": 2408, + "time_per_iteration": 2.742830991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075359, + "balance_loss_mlp": 1.0631156, + "diversity_loss_mlp": 0.0, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.06628126289532602, + "language_loss": 0.73402894, + "learning_rate": 0.0005831058506690563, + "loss": 0.74478251, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.12237549, + "routerloss_mlp": 0.0, + "step": 2409, + "time_per_iteration": 2.6239566802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875374, + "balance_loss_mlp": 1.5126431, + "diversity_loss_mlp": 0.20975235, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.03030502692098504, + "language_loss": 0.86162984, + "learning_rate": 0.0005827986257714464, + "loss": 0.87038362, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01417591, + "step": 2410, + "time_per_iteration": 2.9302031993865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069185, + "balance_loss_mlp": 1.05664992, + "diversity_loss_mlp": 0.0, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.07558638886093381, + "language_loss": 0.88803709, + "learning_rate": 0.0005824913687285591, + "loss": 0.89872897, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2411, + "time_per_iteration": 2.685814142227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070655, + "balance_loss_mlp": 1.05821514, + "diversity_loss_mlp": 0.0, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.1080687232114875, + "language_loss": 0.81367224, + "learning_rate": 0.0005821840796596821, + "loss": 0.82437879, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.12445068, + "routerloss_mlp": 0.0, + "step": 2412, + "time_per_iteration": 2.6551058292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073612, + "balance_loss_mlp": 1.06099916, + "diversity_loss_mlp": 0.0, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.07026214254932567, + "language_loss": 0.80428362, + "learning_rate": 0.0005818767586841158, + "loss": 0.81501973, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.12609863, + "routerloss_mlp": 0.0, + "step": 2413, + "time_per_iteration": 2.759437322616577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.07259476, + "diversity_loss_mlp": 0.0, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.08627931539992734, + "language_loss": 0.86441922, + "learning_rate": 0.0005815694059211726, + "loss": 0.8752715, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.12640381, + "routerloss_mlp": 0.0, + "step": 2414, + "time_per_iteration": 2.658977746963501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171514, + "balance_loss_mlp": 1.16250181, + "diversity_loss_mlp": 0.0, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.047494824411654174, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82045138, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 2415, + "time_per_iteration": 4.799519777297974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145606, + "balance_loss_mlp": 1.13711834, + "diversity_loss_mlp": 0.0, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.043373387729815825, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78090668, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 2416, + "time_per_iteration": 4.990553379058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0087124, + "balance_loss_mlp": 1.50839305, + "diversity_loss_mlp": 0.20828754, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.030578892859867562, + "language_loss": 0.86378521, + "learning_rate": 0.0005806471581013931, + "loss": 0.87249762, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289999, + "step": 2417, + "time_per_iteration": 2.6900436878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122345, + "balance_loss_mlp": 1.11040044, + "diversity_loss_mlp": 0.0, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.07418438196536063, + "language_loss": 0.78360349, + "learning_rate": 0.0005803396793823146, + "loss": 0.79482698, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.1194458, + "routerloss_mlp": 0.0, + "step": 2418, + "time_per_iteration": 2.8027873039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113389, + "balance_loss_mlp": 1.12212396, + "diversity_loss_mlp": 0.0, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.07660062238284089, + "language_loss": 0.85582161, + "learning_rate": 0.0005800321694726065, + "loss": 0.86716056, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2419, + "time_per_iteration": 4.293209075927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870744, + "balance_loss_mlp": 1.50698626, + "diversity_loss_mlp": 0.20827082, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.03270390918014964, + "language_loss": 0.86636543, + "learning_rate": 0.0005797246284916545, + "loss": 0.87507284, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01311516, + "step": 2420, + "time_per_iteration": 2.7184417247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112587, + "balance_loss_mlp": 1.1061976, + "diversity_loss_mlp": 0.0, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.04763479459010098, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78617769, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2421, + "time_per_iteration": 4.978823900222778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164162, + "balance_loss_mlp": 1.1527952, + "diversity_loss_mlp": 0.0, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.08359324638355049, + "language_loss": 0.87635398, + "learning_rate": 0.0005791094537936233, + "loss": 0.8879956, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2422, + "time_per_iteration": 2.706270217895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145768, + "balance_loss_mlp": 1.1349256, + "diversity_loss_mlp": 0.0, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.07317342210777962, + "language_loss": 0.81790811, + "learning_rate": 0.0005788018203153762, + "loss": 0.82936579, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2423, + "time_per_iteration": 2.5965187549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114513, + "balance_loss_mlp": 1.13404965, + "diversity_loss_mlp": 0.0, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08308161607945047, + "language_loss": 0.85607517, + "learning_rate": 0.000578494156243549, + "loss": 0.86752647, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2424, + "time_per_iteration": 2.5783984661102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124685, + "balance_loss_mlp": 1.1135745, + "diversity_loss_mlp": 0.0, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.06702614551613306, + "language_loss": 0.88852286, + "learning_rate": 0.0005781864616975878, + "loss": 0.89976966, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2425, + "time_per_iteration": 2.6615347862243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105595, + "balance_loss_mlp": 1.09463954, + "diversity_loss_mlp": 0.0, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0790317604017366, + "language_loss": 0.84397781, + "learning_rate": 0.0005778787367969502, + "loss": 0.85503376, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2426, + "time_per_iteration": 2.5796711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095822, + "balance_loss_mlp": 1.08478928, + "diversity_loss_mlp": 0.0, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.062032004097500974, + "language_loss": 0.80925953, + "learning_rate": 0.0005775709816611053, + "loss": 0.82021779, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.11029053, + "routerloss_mlp": 0.0, + "step": 2427, + "time_per_iteration": 2.9491348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_mlp": 1.07454419, + "diversity_loss_mlp": 0.0, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0676389696771178, + "language_loss": 0.83549029, + "learning_rate": 0.0005772631964095346, + "loss": 0.8463425, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 2428, + "time_per_iteration": 2.6981353759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081501, + "balance_loss_mlp": 1.07072484, + "diversity_loss_mlp": 0.0, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.08126061261115217, + "language_loss": 0.8576231, + "learning_rate": 0.000576955381161731, + "loss": 0.86843812, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2429, + "time_per_iteration": 2.6633517742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.06313229, + "diversity_loss_mlp": 0.0, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.08275287351868318, + "language_loss": 0.86212349, + "learning_rate": 0.0005766475360371985, + "loss": 0.87286699, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2430, + "time_per_iteration": 2.5904853343963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.06205034, + "diversity_loss_mlp": 0.0, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.0860704645170746, + "language_loss": 0.84563982, + "learning_rate": 0.0005763396611554536, + "loss": 0.85636878, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2431, + "time_per_iteration": 2.6467607021331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.0607698, + "diversity_loss_mlp": 0.0, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.08998246562287979, + "language_loss": 0.80544329, + "learning_rate": 0.0005760317566360237, + "loss": 0.81615859, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2432, + "time_per_iteration": 3.006641387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075816, + "balance_loss_mlp": 1.0648669, + "diversity_loss_mlp": 0.0, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.07509845156715887, + "language_loss": 0.84929144, + "learning_rate": 0.000575723822598448, + "loss": 0.86004961, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2433, + "time_per_iteration": 2.764425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067328, + "balance_loss_mlp": 1.0558188, + "diversity_loss_mlp": 0.0, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.06651895210271294, + "language_loss": 0.8167448, + "learning_rate": 0.0005754158591622773, + "loss": 0.82741809, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2434, + "time_per_iteration": 2.9786107540130615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075165, + "balance_loss_mlp": 1.06366098, + "diversity_loss_mlp": 0.0, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07251033111677281, + "language_loss": 0.82255369, + "learning_rate": 0.0005751078664470732, + "loss": 0.83330536, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2435, + "time_per_iteration": 2.5367684364318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079887, + "balance_loss_mlp": 1.06816268, + "diversity_loss_mlp": 0.0, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.07721942828462902, + "language_loss": 0.85977614, + "learning_rate": 0.0005747998445724094, + "loss": 0.87057501, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.11724854, + "routerloss_mlp": 0.0, + "step": 2436, + "time_per_iteration": 2.636200189590454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108497, + "balance_loss_mlp": 1.07313251, + "diversity_loss_mlp": 0.0, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.07122055500535385, + "language_loss": 0.89087129, + "learning_rate": 0.0005744917936578707, + "loss": 0.90172094, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.11828613, + "routerloss_mlp": 0.0, + "step": 2437, + "time_per_iteration": 2.7820210456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07790279, + "diversity_loss_mlp": 0.0, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.0674848593159629, + "language_loss": 0.84104413, + "learning_rate": 0.0005741837138230526, + "loss": 0.85194385, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.1206665, + "routerloss_mlp": 0.0, + "step": 2438, + "time_per_iteration": 2.7324602603912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091997, + "balance_loss_mlp": 1.07981968, + "diversity_loss_mlp": 0.0, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.08534673561441382, + "language_loss": 0.86345065, + "learning_rate": 0.0005738756051875627, + "loss": 0.87437063, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.12176514, + "routerloss_mlp": 0.0, + "step": 2439, + "time_per_iteration": 3.0705649852752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098053, + "balance_loss_mlp": 1.08564377, + "diversity_loss_mlp": 0.0, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.06467123496854205, + "language_loss": 0.83114249, + "learning_rate": 0.0005735674678710192, + "loss": 0.84212297, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.12414551, + "routerloss_mlp": 0.0, + "step": 2440, + "time_per_iteration": 2.6645498275756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.07644403, + "diversity_loss_mlp": 0.0, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.09155388913703945, + "language_loss": 0.81178355, + "learning_rate": 0.0005732593019930517, + "loss": 0.82267421, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2441, + "time_per_iteration": 2.892775774002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084176, + "balance_loss_mlp": 1.07203436, + "diversity_loss_mlp": 0.0, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.07090754106091501, + "language_loss": 0.87927258, + "learning_rate": 0.0005729511076733008, + "loss": 0.89011431, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.12139893, + "routerloss_mlp": 0.0, + "step": 2442, + "time_per_iteration": 2.629671096801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080039, + "balance_loss_mlp": 1.06766534, + "diversity_loss_mlp": 0.0, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.0886658808398658, + "language_loss": 0.85080904, + "learning_rate": 0.000572642885031418, + "loss": 0.86160946, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.1237793, + "routerloss_mlp": 0.0, + "step": 2443, + "time_per_iteration": 2.858177900314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083351, + "balance_loss_mlp": 1.07077432, + "diversity_loss_mlp": 0.0, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.06516149518751314, + "language_loss": 0.80735445, + "learning_rate": 0.0005723346341870662, + "loss": 0.81818795, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.12573242, + "routerloss_mlp": 0.0, + "step": 2444, + "time_per_iteration": 2.7146968841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084098, + "balance_loss_mlp": 1.07161689, + "diversity_loss_mlp": 0.0, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.08093347646647668, + "language_loss": 0.86360067, + "learning_rate": 0.0005720263552599188, + "loss": 0.87444162, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2445, + "time_per_iteration": 2.5240447521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077035, + "balance_loss_mlp": 1.06469131, + "diversity_loss_mlp": 0.0, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.10031003663616385, + "language_loss": 0.80052316, + "learning_rate": 0.0005717180483696604, + "loss": 0.81129348, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.12347412, + "routerloss_mlp": 0.0, + "step": 2446, + "time_per_iteration": 2.8576042652130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076731, + "balance_loss_mlp": 1.06456566, + "diversity_loss_mlp": 0.0, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.06704052343949889, + "language_loss": 0.82989585, + "learning_rate": 0.0005714097136359862, + "loss": 0.84066319, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.12164307, + "routerloss_mlp": 0.0, + "step": 2447, + "time_per_iteration": 2.624566078186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841696, + "balance_loss_mlp": 1.45028305, + "diversity_loss_mlp": 0.205522, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.027205551471082397, + "language_loss": 0.86918223, + "learning_rate": 0.0005711013511786027, + "loss": 0.87759912, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01379322, + "step": 2448, + "time_per_iteration": 2.797086238861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106901, + "balance_loss_mlp": 1.05689788, + "diversity_loss_mlp": 0.0, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06342125158561994, + "language_loss": 0.83811176, + "learning_rate": 0.0005707929611172263, + "loss": 0.84880185, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2449, + "time_per_iteration": 2.731825351715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071528, + "balance_loss_mlp": 1.05951726, + "diversity_loss_mlp": 0.0, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.09170207604049842, + "language_loss": 0.84256124, + "learning_rate": 0.000570484543571585, + "loss": 0.85327655, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2450, + "time_per_iteration": 2.5735461711883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_mlp": 1.05268502, + "diversity_loss_mlp": 0.0, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.08479509676509417, + "language_loss": 0.82936448, + "learning_rate": 0.0005701760986614171, + "loss": 0.84001064, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2451, + "time_per_iteration": 2.537297248840332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071081, + "balance_loss_mlp": 1.0591718, + "diversity_loss_mlp": 0.0, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.059658494784791405, + "language_loss": 0.8734417, + "learning_rate": 0.0005698676265064714, + "loss": 0.88415247, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.11901855, + "routerloss_mlp": 0.0, + "step": 2452, + "time_per_iteration": 2.5586979389190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.06525099, + "diversity_loss_mlp": 0.0, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.0707454592736124, + "language_loss": 0.89208829, + "learning_rate": 0.0005695591272265074, + "loss": 0.90285689, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.1159668, + "routerloss_mlp": 0.0, + "step": 2453, + "time_per_iteration": 2.527719736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088311, + "balance_loss_mlp": 1.07617581, + "diversity_loss_mlp": 0.0, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.07134640406799209, + "language_loss": 0.81947398, + "learning_rate": 0.0005692506009412954, + "loss": 0.83035707, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.12133789, + "routerloss_mlp": 0.0, + "step": 2454, + "time_per_iteration": 2.6558947563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0064123, + "balance_loss_mlp": 1.11988485, + "diversity_loss_mlp": 0.13842735, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.002527541257966033, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78192496, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01207405, + "step": 2455, + "time_per_iteration": 5.005730628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.07716715, + "diversity_loss_mlp": 0.0, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07179176619920838, + "language_loss": 0.89308333, + "learning_rate": 0.0005686334678342593, + "loss": 0.90397304, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2456, + "time_per_iteration": 2.8779940605163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094143, + "balance_loss_mlp": 1.08280611, + "diversity_loss_mlp": 0.0, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08187467616753978, + "language_loss": 0.81664062, + "learning_rate": 0.0005683248612520274, + "loss": 0.82758206, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.11334229, + "routerloss_mlp": 0.0, + "step": 2457, + "time_per_iteration": 3.0844156742095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087436, + "balance_loss_mlp": 1.07605195, + "diversity_loss_mlp": 0.0, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.08330432962991885, + "language_loss": 0.83940041, + "learning_rate": 0.0005680162281437321, + "loss": 0.85027468, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2458, + "time_per_iteration": 2.886364221572876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_mlp": 1.07263231, + "diversity_loss_mlp": 0.0, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.06607837126207569, + "language_loss": 0.84340584, + "learning_rate": 0.000567707568629195, + "loss": 0.8542465, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2459, + "time_per_iteration": 2.7153613567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082795, + "balance_loss_mlp": 1.0712074, + "diversity_loss_mlp": 0.0, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.0662532862091719, + "language_loss": 0.82247961, + "learning_rate": 0.0005673988828282486, + "loss": 0.8333075, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2460, + "time_per_iteration": 2.6740705966949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06760526, + "diversity_loss_mlp": 0.0, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.05997115702153478, + "language_loss": 0.81122911, + "learning_rate": 0.0005670901708607352, + "loss": 0.82202172, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.11645508, + "routerloss_mlp": 0.0, + "step": 2461, + "time_per_iteration": 3.0222864151000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077887, + "balance_loss_mlp": 1.0661211, + "diversity_loss_mlp": 0.0, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.12722631062247966, + "language_loss": 0.83784962, + "learning_rate": 0.0005667814328465076, + "loss": 0.84862852, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2462, + "time_per_iteration": 2.62223744392395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071899, + "balance_loss_mlp": 1.06031179, + "diversity_loss_mlp": 0.0, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.10920156375550993, + "language_loss": 0.82163846, + "learning_rate": 0.0005664726689054285, + "loss": 0.83235747, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2463, + "time_per_iteration": 2.474776029586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.06096554, + "diversity_loss_mlp": 0.0, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.07990467081118383, + "language_loss": 0.80772603, + "learning_rate": 0.0005661638791573704, + "loss": 0.81845051, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2464, + "time_per_iteration": 2.699165105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073096, + "balance_loss_mlp": 1.06145513, + "diversity_loss_mlp": 0.0, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.06593248790897067, + "language_loss": 0.86978662, + "learning_rate": 0.0005658550637222164, + "loss": 0.8805176, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2465, + "time_per_iteration": 2.6154093742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.0586381, + "diversity_loss_mlp": 0.0, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.06422453310815268, + "language_loss": 0.82103038, + "learning_rate": 0.0005655462227198592, + "loss": 0.83173257, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.11566162, + "routerloss_mlp": 0.0, + "step": 2466, + "time_per_iteration": 2.888040065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068955, + "balance_loss_mlp": 1.05703366, + "diversity_loss_mlp": 0.0, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.07464863741428074, + "language_loss": 0.84426093, + "learning_rate": 0.0005652373562702016, + "loss": 0.85495043, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2467, + "time_per_iteration": 2.6240220069885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071196, + "balance_loss_mlp": 1.05926943, + "diversity_loss_mlp": 0.0, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.06778780294468974, + "language_loss": 0.88405621, + "learning_rate": 0.000564928464493156, + "loss": 0.89476824, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2468, + "time_per_iteration": 2.598493814468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068768, + "balance_loss_mlp": 1.05676329, + "diversity_loss_mlp": 0.0, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.06443301027733518, + "language_loss": 0.81735635, + "learning_rate": 0.000564619547508645, + "loss": 0.82804406, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2469, + "time_per_iteration": 4.510512828826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_mlp": 1.05816698, + "diversity_loss_mlp": 0.0, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.0879456232971056, + "language_loss": 0.82882106, + "learning_rate": 0.0005643106054366008, + "loss": 0.83952397, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2470, + "time_per_iteration": 2.5648152828216553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.06276536, + "diversity_loss_mlp": 0.0, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.06194770014341408, + "language_loss": 0.79193991, + "learning_rate": 0.000564001638396965, + "loss": 0.8026849, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.11706543, + "routerloss_mlp": 0.0, + "step": 2471, + "time_per_iteration": 2.7267987728118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073205, + "balance_loss_mlp": 1.06152296, + "diversity_loss_mlp": 0.0, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.06505306942508977, + "language_loss": 0.82164901, + "learning_rate": 0.0005636926465096897, + "loss": 0.83238107, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2472, + "time_per_iteration": 3.035590887069702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078551, + "balance_loss_mlp": 1.06670165, + "diversity_loss_mlp": 0.0, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08684318660371242, + "language_loss": 0.8723672, + "learning_rate": 0.0005633836298947363, + "loss": 0.88315272, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2473, + "time_per_iteration": 4.002026796340942 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091096, + "balance_loss_mlp": 1.07912695, + "diversity_loss_mlp": 0.0, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.0706680414575132, + "language_loss": 0.70566314, + "learning_rate": 0.000563074588672075, + "loss": 0.71657413, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.11950684, + "routerloss_mlp": 0.0, + "step": 2474, + "time_per_iteration": 2.6985795497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089769, + "balance_loss_mlp": 1.07802129, + "diversity_loss_mlp": 0.0, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.06282750442858279, + "language_loss": 0.85378051, + "learning_rate": 0.0005627655229616868, + "loss": 0.86467826, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.11743164, + "routerloss_mlp": 0.0, + "step": 2475, + "time_per_iteration": 2.7580935955047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091111, + "balance_loss_mlp": 1.07941031, + "diversity_loss_mlp": 0.0, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07002888905047219, + "language_loss": 0.90058106, + "learning_rate": 0.0005624564328835616, + "loss": 0.91149217, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2476, + "time_per_iteration": 2.789257764816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108666, + "balance_loss_mlp": 1.07509637, + "diversity_loss_mlp": 0.0, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.06042863191219761, + "language_loss": 0.84203571, + "learning_rate": 0.0005621473185576986, + "loss": 0.85290229, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2477, + "time_per_iteration": 2.724280834197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_mlp": 1.07846594, + "diversity_loss_mlp": 0.0, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 0.07203405271885309, + "language_loss": 0.87555075, + "learning_rate": 0.0005618381801041068, + "loss": 0.88644993, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2478, + "time_per_iteration": 2.6800026893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085708, + "balance_loss_mlp": 1.0738883, + "diversity_loss_mlp": 0.0, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.08495018756940642, + "language_loss": 0.83006722, + "learning_rate": 0.0005615290176428044, + "loss": 0.84092432, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.11810303, + "routerloss_mlp": 0.0, + "step": 2479, + "time_per_iteration": 2.6456432342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078377, + "balance_loss_mlp": 1.06658673, + "diversity_loss_mlp": 0.0, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07371403414772894, + "language_loss": 0.84979588, + "learning_rate": 0.0005612198312938187, + "loss": 0.86057961, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.11779785, + "routerloss_mlp": 0.0, + "step": 2480, + "time_per_iteration": 2.7325923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085233, + "balance_loss_mlp": 1.0737772, + "diversity_loss_mlp": 0.0, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.05926830515799366, + "language_loss": 0.79493093, + "learning_rate": 0.0005609106211771868, + "loss": 0.80578327, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2481, + "time_per_iteration": 2.8374931812286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108305, + "balance_loss_mlp": 1.07103384, + "diversity_loss_mlp": 0.0, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.06643858588339867, + "language_loss": 0.88938701, + "learning_rate": 0.0005606013874129543, + "loss": 0.90021759, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.12011719, + "routerloss_mlp": 0.0, + "step": 2482, + "time_per_iteration": 2.7547929286956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081649, + "balance_loss_mlp": 1.07017505, + "diversity_loss_mlp": 0.0, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06416127972697647, + "language_loss": 0.80410159, + "learning_rate": 0.0005602921301211768, + "loss": 0.81491804, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2483, + "time_per_iteration": 2.7025153636932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080053, + "balance_loss_mlp": 1.06850159, + "diversity_loss_mlp": 0.0, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.07652865967226291, + "language_loss": 0.8209163, + "learning_rate": 0.0005599828494219185, + "loss": 0.83171678, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.11541748, + "routerloss_mlp": 0.0, + "step": 2484, + "time_per_iteration": 2.5415024757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070825, + "balance_loss_mlp": 1.05903542, + "diversity_loss_mlp": 0.0, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.07721505579443601, + "language_loss": 0.89162952, + "learning_rate": 0.0005596735454352527, + "loss": 0.90233779, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.11785889, + "routerloss_mlp": 0.0, + "step": 2485, + "time_per_iteration": 2.8591346740722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077742, + "balance_loss_mlp": 1.06591046, + "diversity_loss_mlp": 0.0, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.07819028279068943, + "language_loss": 0.85696715, + "learning_rate": 0.0005593642182812619, + "loss": 0.86774457, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.1182251, + "routerloss_mlp": 0.0, + "step": 2486, + "time_per_iteration": 2.679927349090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.06575358, + "diversity_loss_mlp": 0.0, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.0859238614993436, + "language_loss": 0.83753216, + "learning_rate": 0.0005590548680800378, + "loss": 0.84830678, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.11694336, + "routerloss_mlp": 0.0, + "step": 2487, + "time_per_iteration": 3.0984909534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.05950415, + "diversity_loss_mlp": 0.0, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.06795851613398404, + "language_loss": 0.76434267, + "learning_rate": 0.0005587454949516804, + "loss": 0.77505481, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2488, + "time_per_iteration": 2.692324161529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.06507468, + "diversity_loss_mlp": 0.0, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.06921637005003253, + "language_loss": 0.8785038, + "learning_rate": 0.0005584360990162993, + "loss": 0.88927084, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2489, + "time_per_iteration": 2.646521806716919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077817, + "balance_loss_mlp": 1.06614649, + "diversity_loss_mlp": 0.0, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06386300972416134, + "language_loss": 0.85713631, + "learning_rate": 0.0005581266803940124, + "loss": 0.86791456, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.11657715, + "routerloss_mlp": 0.0, + "step": 2490, + "time_per_iteration": 2.735152244567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.05925143, + "diversity_loss_mlp": 0.0, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.0718717211843218, + "language_loss": 0.87536263, + "learning_rate": 0.0005578172392049471, + "loss": 0.88607073, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.11553955, + "routerloss_mlp": 0.0, + "step": 2491, + "time_per_iteration": 2.7718377113342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00892921, + "balance_loss_mlp": 1.54530287, + "diversity_loss_mlp": 0.21191472, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 0.033555176901221506, + "language_loss": 0.84551859, + "learning_rate": 0.0005575077755692386, + "loss": 0.85444778, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01431197, + "step": 2492, + "time_per_iteration": 2.81888747215271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05893993, + "diversity_loss_mlp": 0.0, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.054684262853474656, + "language_loss": 0.86001486, + "learning_rate": 0.0005571982896070316, + "loss": 0.8707189, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2493, + "time_per_iteration": 2.655311346054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084039, + "balance_loss_mlp": 1.07248712, + "diversity_loss_mlp": 0.0, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.07545203546694841, + "language_loss": 0.89854079, + "learning_rate": 0.0005568887814384792, + "loss": 0.90938115, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.11547852, + "routerloss_mlp": 0.0, + "step": 2494, + "time_per_iteration": 2.5930681228637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_mlp": 1.07098675, + "diversity_loss_mlp": 0.0, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07194257940045806, + "language_loss": 0.87281573, + "learning_rate": 0.000556579251183743, + "loss": 0.88364077, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.11517334, + "routerloss_mlp": 0.0, + "step": 2495, + "time_per_iteration": 2.6386003494262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076942, + "balance_loss_mlp": 1.06520605, + "diversity_loss_mlp": 0.0, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.0750590648958695, + "language_loss": 0.80158448, + "learning_rate": 0.0005562696989629936, + "loss": 0.81235385, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.11737061, + "routerloss_mlp": 0.0, + "step": 2496, + "time_per_iteration": 2.7050864696502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00880705, + "balance_loss_mlp": 1.52288473, + "diversity_loss_mlp": 0.21003026, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.02916103721032611, + "language_loss": 0.82606125, + "learning_rate": 0.0005559601248964095, + "loss": 0.83486831, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424794, + "step": 2497, + "time_per_iteration": 2.6473939418792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085332, + "balance_loss_mlp": 1.0741564, + "diversity_loss_mlp": 0.0, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.07410871061403823, + "language_loss": 0.85882998, + "learning_rate": 0.0005556505291041783, + "loss": 0.86968333, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.11175537, + "routerloss_mlp": 0.0, + "step": 2498, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105898, + "balance_loss_mlp": 1.09428692, + "diversity_loss_mlp": 0.0, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.06465509842390993, + "language_loss": 0.84413946, + "learning_rate": 0.0005553409117064954, + "loss": 0.8551985, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2499, + "time_per_iteration": 2.880300521850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00859857, + "balance_loss_mlp": 1.48415303, + "diversity_loss_mlp": 0.20870377, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.02869897963967695, + "language_loss": 0.84937358, + "learning_rate": 0.0005550312728235654, + "loss": 0.85797209, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01342856, + "step": 2500, + "time_per_iteration": 2.7199203968048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_mlp": 1.08251953, + "diversity_loss_mlp": 0.0, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.07331859457791397, + "language_loss": 0.83879191, + "learning_rate": 0.0005547216125756003, + "loss": 0.84973377, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2501, + "time_per_iteration": 2.732786178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098928, + "balance_loss_mlp": 1.08708501, + "diversity_loss_mlp": 0.0, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07387575947985975, + "language_loss": 0.82064617, + "learning_rate": 0.0005544119310828211, + "loss": 0.83163536, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.11846924, + "routerloss_mlp": 0.0, + "step": 2502, + "time_per_iteration": 3.1029446125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100673, + "balance_loss_mlp": 1.08865714, + "diversity_loss_mlp": 0.0, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.06596898477591598, + "language_loss": 0.84657413, + "learning_rate": 0.0005541022284654568, + "loss": 0.8575809, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.12017822, + "routerloss_mlp": 0.0, + "step": 2503, + "time_per_iteration": 2.901026725769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092163, + "balance_loss_mlp": 1.08015907, + "diversity_loss_mlp": 0.0, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.0759157238743441, + "language_loss": 0.83907866, + "learning_rate": 0.0005537925048437446, + "loss": 0.85000032, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2504, + "time_per_iteration": 2.6014060974121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00594545, + "balance_loss_mlp": 1.03097272, + "diversity_loss_mlp": 0.13453583, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.0017952613590721677, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76346016, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179097, + "step": 2505, + "time_per_iteration": 4.960138320922852 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867388, + "balance_loss_mlp": 1.49711311, + "diversity_loss_mlp": 0.20998067, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + "grad_norm": 0.029195885141922995, + "language_loss": 0.88189656, + "learning_rate": 0.0005531729950682664, + "loss": 0.8905704, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01384138, + "step": 2506, + "time_per_iteration": 3.056671142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_mlp": 1.07027662, + "diversity_loss_mlp": 0.0, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.09591114443507165, + "language_loss": 0.84746361, + "learning_rate": 0.000552863209155015, + "loss": 0.85828793, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.12158203, + "routerloss_mlp": 0.0, + "step": 2507, + "time_per_iteration": 2.473930835723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00866012, + "balance_loss_mlp": 1.49284506, + "diversity_loss_mlp": 0.21081753, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.03047035716712285, + "language_loss": 0.82048851, + "learning_rate": 0.0005525534027184461, + "loss": 0.82914865, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01418037, + "step": 2508, + "time_per_iteration": 2.5708260536193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078028, + "balance_loss_mlp": 1.06624985, + "diversity_loss_mlp": 0.0, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.06261213728600334, + "language_loss": 0.83131289, + "learning_rate": 0.0005522435758788365, + "loss": 0.84209323, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2509, + "time_per_iteration": 2.7291650772094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00853572, + "balance_loss_mlp": 1.46908307, + "diversity_loss_mlp": 0.20966808, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.03495470447814039, + "language_loss": 0.80126894, + "learning_rate": 0.0005519337287564721, + "loss": 0.80980462, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01419635, + "step": 2510, + "time_per_iteration": 2.843698024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_mlp": 1.06536365, + "diversity_loss_mlp": 0.0, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07525780944119016, + "language_loss": 0.83495927, + "learning_rate": 0.000551623861471646, + "loss": 0.84572971, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.11669922, + "routerloss_mlp": 0.0, + "step": 2511, + "time_per_iteration": 2.7327091693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133891, + "balance_loss_mlp": 1.1273582, + "diversity_loss_mlp": 0.0, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.052890092991212126, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79952717, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.06542969, + "routerloss_mlp": 0.0, + "step": 2512, + "time_per_iteration": 4.820046901702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073764, + "balance_loss_mlp": 1.06182551, + "diversity_loss_mlp": 0.0, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.09417698665840035, + "language_loss": 0.8670119, + "learning_rate": 0.0005510040668958211, + "loss": 0.87774956, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.1192627, + "routerloss_mlp": 0.0, + "step": 2513, + "time_per_iteration": 2.579780101776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051826, + "balance_loss_mlp": 1.04515004, + "diversity_loss_mlp": 0.0, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.02705432320804172, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78812408, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.06689453, + "routerloss_mlp": 0.0, + "step": 2514, + "time_per_iteration": 4.83507227897644 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106953, + "balance_loss_mlp": 1.05716157, + "diversity_loss_mlp": 0.0, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07432123735470587, + "language_loss": 0.83170015, + "learning_rate": 0.0005503841931138645, + "loss": 0.84239542, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.12365723, + "routerloss_mlp": 0.0, + "step": 2515, + "time_per_iteration": 2.6834895610809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071737, + "balance_loss_mlp": 1.05963731, + "diversity_loss_mlp": 0.0, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.07510504832931036, + "language_loss": 0.81515384, + "learning_rate": 0.0005500742268214025, + "loss": 0.82587123, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.12091064, + "routerloss_mlp": 0.0, + "step": 2516, + "time_per_iteration": 2.494479179382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084077, + "balance_loss_mlp": 1.0715425, + "diversity_loss_mlp": 0.0, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06432693662792612, + "language_loss": 0.85142744, + "learning_rate": 0.0005497642410884014, + "loss": 0.86226821, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.12542725, + "routerloss_mlp": 0.0, + "step": 2517, + "time_per_iteration": 2.760425090789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.06788325, + "diversity_loss_mlp": 0.0, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.06763953923030977, + "language_loss": 0.85120749, + "learning_rate": 0.0005494542360352085, + "loss": 0.86201251, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.12628174, + "routerloss_mlp": 0.0, + "step": 2518, + "time_per_iteration": 2.6524109840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108191, + "balance_loss_mlp": 1.06955993, + "diversity_loss_mlp": 0.0, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.06089591080825084, + "language_loss": 0.85741639, + "learning_rate": 0.0005491442117821783, + "loss": 0.86823547, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.12353516, + "routerloss_mlp": 0.0, + "step": 2519, + "time_per_iteration": 2.7461459636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079216, + "balance_loss_mlp": 1.06654429, + "diversity_loss_mlp": 0.0, + "epoch": 0.48480184686417854, + "flos": 529390204416.0, + "grad_norm": 0.07584750574127574, + "language_loss": 0.87494171, + "learning_rate": 0.0005488341684496732, + "loss": 0.88573384, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.12677002, + "routerloss_mlp": 0.0, + "step": 2520, + "time_per_iteration": 2.6621458530426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.06843615, + "diversity_loss_mlp": 0.0, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06605179609441998, + "language_loss": 0.9207437, + "learning_rate": 0.0005485241061580624, + "loss": 0.9315502, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2521, + "time_per_iteration": 2.772949457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.07741094, + "diversity_loss_mlp": 0.0, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.06556104217544546, + "language_loss": 0.8458938, + "learning_rate": 0.0005482140250277228, + "loss": 0.85679281, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.12481689, + "routerloss_mlp": 0.0, + "step": 2522, + "time_per_iteration": 2.978330135345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847105, + "balance_loss_mlp": 1.45509815, + "diversity_loss_mlp": 0.21114388, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.03368619412239962, + "language_loss": 0.87090278, + "learning_rate": 0.0005479039251790387, + "loss": 0.87937379, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01398425, + "step": 2523, + "time_per_iteration": 2.6939120292663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00840008, + "balance_loss_mlp": 1.44148707, + "diversity_loss_mlp": 0.21069397, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.03188648694570784, + "language_loss": 0.84722733, + "learning_rate": 0.0005475938067324014, + "loss": 0.85562754, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0139178, + "step": 2524, + "time_per_iteration": 2.859184980392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106923, + "balance_loss_mlp": 1.09528267, + "diversity_loss_mlp": 0.0, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.06962736532334403, + "language_loss": 0.83518255, + "learning_rate": 0.0005472836698082098, + "loss": 0.84625173, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.11633301, + "routerloss_mlp": 0.0, + "step": 2525, + "time_per_iteration": 2.534783363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101033, + "balance_loss_mlp": 1.08923149, + "diversity_loss_mlp": 0.0, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.07423434170097615, + "language_loss": 0.84140873, + "learning_rate": 0.0005469735145268694, + "loss": 0.85241902, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2526, + "time_per_iteration": 2.7064108848571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090982, + "balance_loss_mlp": 1.07928169, + "diversity_loss_mlp": 0.0, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.0731540325655248, + "language_loss": 0.81093931, + "learning_rate": 0.0005466633410087933, + "loss": 0.82184911, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2527, + "time_per_iteration": 2.682969570159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085513, + "balance_loss_mlp": 1.07793164, + "diversity_loss_mlp": 0.0, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03711409557498352, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78346336, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.07568359, + "routerloss_mlp": 0.0, + "step": 2528, + "time_per_iteration": 4.962444067001343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085086, + "balance_loss_mlp": 1.07360601, + "diversity_loss_mlp": 0.0, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.07791605184695856, + "language_loss": 0.88148236, + "learning_rate": 0.0005460429397441214, + "loss": 0.89233321, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.11468506, + "routerloss_mlp": 0.0, + "step": 2529, + "time_per_iteration": 2.5908102989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00835644, + "balance_loss_mlp": 1.43002903, + "diversity_loss_mlp": 0.21195745, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.03186279831907627, + "language_loss": 0.87013817, + "learning_rate": 0.0005457327122383866, + "loss": 0.87849462, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01465126, + "step": 2530, + "time_per_iteration": 2.656264543533325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_mlp": 1.02939153, + "diversity_loss_mlp": 0.0, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.02373673385224348, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75673413, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.07519531, + "routerloss_mlp": 0.0, + "step": 2531, + "time_per_iteration": 4.838496208190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100935, + "balance_loss_mlp": 1.08965194, + "diversity_loss_mlp": 0.0, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.06845758574896237, + "language_loss": 0.75823385, + "learning_rate": 0.0005451122040823244, + "loss": 0.76924324, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2532, + "time_per_iteration": 2.770751714706421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099102, + "balance_loss_mlp": 1.08746696, + "diversity_loss_mlp": 0.0, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07387169787784394, + "language_loss": 0.77164292, + "learning_rate": 0.0005448019236728997, + "loss": 0.7826339, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.11621094, + "routerloss_mlp": 0.0, + "step": 2533, + "time_per_iteration": 2.8874497413635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837303, + "balance_loss_mlp": 1.43305767, + "diversity_loss_mlp": 0.21233971, + "epoch": 0.48749519045786843, + "flos": 512479996416.0, + "grad_norm": 0.03246629845535473, + "language_loss": 0.8471576, + "learning_rate": 0.0005444916258698255, + "loss": 0.85553062, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01460437, + "step": 2534, + "time_per_iteration": 2.623748540878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112009, + "balance_loss_mlp": 1.10867584, + "diversity_loss_mlp": 0.0, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06488105381348498, + "language_loss": 0.86077154, + "learning_rate": 0.0005441813107935704, + "loss": 0.87197244, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2535, + "time_per_iteration": 2.6705739498138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124443, + "balance_loss_mlp": 1.11277819, + "diversity_loss_mlp": 0.0, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07112550287999594, + "language_loss": 0.86025345, + "learning_rate": 0.0005438709785646091, + "loss": 0.87149793, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2536, + "time_per_iteration": 2.5624749660491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120427, + "balance_loss_mlp": 1.10864902, + "diversity_loss_mlp": 0.0, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.08492074314505418, + "language_loss": 0.86885595, + "learning_rate": 0.0005435606293034234, + "loss": 0.8800602, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2537, + "time_per_iteration": 2.6347479820251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121847, + "balance_loss_mlp": 1.11035514, + "diversity_loss_mlp": 0.0, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.08214525409599778, + "language_loss": 0.84619427, + "learning_rate": 0.0005432502631305016, + "loss": 0.8574127, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2538, + "time_per_iteration": 2.700613021850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113422, + "balance_loss_mlp": 1.10190618, + "diversity_loss_mlp": 0.0, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06429037959601741, + "language_loss": 0.83193302, + "learning_rate": 0.0005429398801663386, + "loss": 0.84306723, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2539, + "time_per_iteration": 2.9839913845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097658, + "balance_loss_mlp": 1.08599913, + "diversity_loss_mlp": 0.0, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.12053819121868696, + "language_loss": 0.8290484, + "learning_rate": 0.0005426294805314355, + "loss": 0.84002495, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2540, + "time_per_iteration": 2.5029373168945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.08291781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.06245664696917761, + "language_loss": 0.80155998, + "learning_rate": 0.0005423190643463003, + "loss": 0.81250799, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2541, + "time_per_iteration": 2.949772357940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093208, + "balance_loss_mlp": 1.08163261, + "diversity_loss_mlp": 0.0, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.07791209549750817, + "language_loss": 0.8281579, + "learning_rate": 0.0005420086317314473, + "loss": 0.83908999, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2542, + "time_per_iteration": 2.6383941173553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088553, + "balance_loss_mlp": 1.0765729, + "diversity_loss_mlp": 0.0, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06362759827284906, + "language_loss": 0.81081557, + "learning_rate": 0.0005416981828073971, + "loss": 0.82170111, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.11981201, + "routerloss_mlp": 0.0, + "step": 2543, + "time_per_iteration": 2.8023576736450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007156, + "balance_loss_mlp": 0.99990815, + "diversity_loss_mlp": 0.0, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.01938913368632236, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78122175, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.07226562, + "routerloss_mlp": 0.0, + "step": 2544, + "time_per_iteration": 4.817458629608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.08184147, + "diversity_loss_mlp": 0.0, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.08678858450341921, + "language_loss": 0.84937072, + "learning_rate": 0.000541077236513819, + "loss": 0.86030519, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.11590576, + "routerloss_mlp": 0.0, + "step": 2545, + "time_per_iteration": 2.5271120071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089352, + "balance_loss_mlp": 1.07800293, + "diversity_loss_mlp": 0.0, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.07207098978073255, + "language_loss": 0.82449925, + "learning_rate": 0.0005407667393853638, + "loss": 0.83539271, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2546, + "time_per_iteration": 2.6385204792022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.08250618, + "diversity_loss_mlp": 0.0, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.06843607218978102, + "language_loss": 0.83673334, + "learning_rate": 0.0005404562264298569, + "loss": 0.84766948, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.11108398, + "routerloss_mlp": 0.0, + "step": 2547, + "time_per_iteration": 2.845250368118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102851, + "balance_loss_mlp": 1.09120405, + "diversity_loss_mlp": 0.0, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.06940893068641271, + "language_loss": 0.83999467, + "learning_rate": 0.0005401456977678498, + "loss": 0.8510232, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2548, + "time_per_iteration": 2.638720750808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099322, + "balance_loss_mlp": 1.08754444, + "diversity_loss_mlp": 0.0, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08453175850654031, + "language_loss": 0.77431965, + "learning_rate": 0.0005398351535199008, + "loss": 0.78531289, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.11773682, + "routerloss_mlp": 0.0, + "step": 2549, + "time_per_iteration": 3.064035415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103016, + "balance_loss_mlp": 1.09175706, + "diversity_loss_mlp": 0.0, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.07238427843662706, + "language_loss": 0.84189212, + "learning_rate": 0.0005395245938065735, + "loss": 0.85292226, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2550, + "time_per_iteration": 2.7746829986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118468, + "balance_loss_mlp": 1.10702372, + "diversity_loss_mlp": 0.0, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.08583684211433391, + "language_loss": 0.82631576, + "learning_rate": 0.0005392140187484379, + "loss": 0.83750039, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2551, + "time_per_iteration": 2.582195281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124142, + "balance_loss_mlp": 1.11273384, + "diversity_loss_mlp": 0.0, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.0682243054902728, + "language_loss": 0.89719319, + "learning_rate": 0.0005389034284660701, + "loss": 0.90843463, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.11401367, + "routerloss_mlp": 0.0, + "step": 2552, + "time_per_iteration": 2.824427366256714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131566, + "balance_loss_mlp": 1.12022352, + "diversity_loss_mlp": 0.0, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.08386347311462448, + "language_loss": 0.82537109, + "learning_rate": 0.000538592823080052, + "loss": 0.83668673, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2553, + "time_per_iteration": 3.24122953414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127167, + "balance_loss_mlp": 1.11565781, + "diversity_loss_mlp": 0.0, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.06967590045443849, + "language_loss": 0.84592807, + "learning_rate": 0.000538282202710971, + "loss": 0.85719973, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2554, + "time_per_iteration": 2.5753910541534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130476, + "balance_loss_mlp": 1.11918652, + "diversity_loss_mlp": 0.0, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.07442252581599826, + "language_loss": 0.82315147, + "learning_rate": 0.000537971567479421, + "loss": 0.83445626, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2555, + "time_per_iteration": 2.7354228496551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127557, + "balance_loss_mlp": 1.11596429, + "diversity_loss_mlp": 0.0, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.09076326784032986, + "language_loss": 0.88129175, + "learning_rate": 0.0005376609175060011, + "loss": 0.8925674, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2556, + "time_per_iteration": 2.6124610900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106232, + "balance_loss_mlp": 1.09465659, + "diversity_loss_mlp": 0.0, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.07210041581715526, + "language_loss": 0.80779845, + "learning_rate": 0.0005373502529113162, + "loss": 0.81886077, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2557, + "time_per_iteration": 2.823993444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100884, + "balance_loss_mlp": 1.08888519, + "diversity_loss_mlp": 0.0, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.07460313059090624, + "language_loss": 0.81449521, + "learning_rate": 0.0005370395738159773, + "loss": 0.82550406, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2558, + "time_per_iteration": 2.6436777114868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00834873, + "balance_loss_mlp": 1.42800272, + "diversity_loss_mlp": 0.21467975, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.03347414568603151, + "language_loss": 0.82822633, + "learning_rate": 0.0005367288803406003, + "loss": 0.83657515, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01353174, + "step": 2559, + "time_per_iteration": 2.662224531173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083349, + "balance_loss_mlp": 1.07132101, + "diversity_loss_mlp": 0.0, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0788259825299616, + "language_loss": 0.818443, + "learning_rate": 0.0005364181726058073, + "loss": 0.82927656, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2560, + "time_per_iteration": 2.686300277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.06417727, + "diversity_loss_mlp": 0.0, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.07955060847799823, + "language_loss": 0.8272332, + "learning_rate": 0.0005361074507322261, + "loss": 0.83799613, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.12103271, + "routerloss_mlp": 0.0, + "step": 2561, + "time_per_iteration": 2.5809431076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073469, + "balance_loss_mlp": 1.06138754, + "diversity_loss_mlp": 0.0, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07091460094801966, + "language_loss": 0.81425411, + "learning_rate": 0.000535796714840489, + "loss": 0.82498884, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2562, + "time_per_iteration": 2.6425187587738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073356, + "balance_loss_mlp": 1.06107163, + "diversity_loss_mlp": 0.0, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.10871355986071002, + "language_loss": 0.83800626, + "learning_rate": 0.0005354859650512348, + "loss": 0.84873986, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.12280273, + "routerloss_mlp": 0.0, + "step": 2563, + "time_per_iteration": 2.7957375049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074544, + "balance_loss_mlp": 1.06282604, + "diversity_loss_mlp": 0.0, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.0798917687203661, + "language_loss": 0.87428886, + "learning_rate": 0.0005351752014851074, + "loss": 0.88503432, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.11712646, + "routerloss_mlp": 0.0, + "step": 2564, + "time_per_iteration": 2.6205673217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085324, + "balance_loss_mlp": 1.07352281, + "diversity_loss_mlp": 0.0, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.06874397476353511, + "language_loss": 0.83621442, + "learning_rate": 0.0005348644242627553, + "loss": 0.84706771, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2565, + "time_per_iteration": 2.7460625171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010105, + "balance_loss_mlp": 1.00411022, + "diversity_loss_mlp": 0.0, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.013767653611631516, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76297128, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2566, + "time_per_iteration": 4.943475723266602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110567, + "balance_loss_mlp": 1.09899187, + "diversity_loss_mlp": 0.0, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.08759046492811678, + "language_loss": 0.81650245, + "learning_rate": 0.0005342428293320013, + "loss": 0.82760805, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2567, + "time_per_iteration": 2.7889564037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102659, + "balance_loss_mlp": 1.09142327, + "diversity_loss_mlp": 0.0, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07999691418133484, + "language_loss": 0.8344667, + "learning_rate": 0.0005339320118649238, + "loss": 0.84549326, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.11230469, + "routerloss_mlp": 0.0, + "step": 2568, + "time_per_iteration": 2.7774229049682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.09715271, + "diversity_loss_mlp": 0.0, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.07608170940546952, + "language_loss": 0.86422324, + "learning_rate": 0.000533621181224271, + "loss": 0.87530512, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2569, + "time_per_iteration": 2.7708005905151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095635, + "balance_loss_mlp": 1.08442283, + "diversity_loss_mlp": 0.0, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.06858054906862693, + "language_loss": 0.8138749, + "learning_rate": 0.0005333103375307182, + "loss": 0.82483125, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.11218262, + "routerloss_mlp": 0.0, + "step": 2570, + "time_per_iteration": 2.8407034873962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07972121, + "diversity_loss_mlp": 0.0, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06174009778797697, + "language_loss": 0.85711801, + "learning_rate": 0.0005329994809049451, + "loss": 0.86802495, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.10974121, + "routerloss_mlp": 0.0, + "step": 2571, + "time_per_iteration": 2.7500712871551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096363, + "balance_loss_mlp": 1.08508563, + "diversity_loss_mlp": 0.0, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.06855083904022342, + "language_loss": 0.88066995, + "learning_rate": 0.0005326886114676375, + "loss": 0.89163363, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2572, + "time_per_iteration": 2.730137825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_mlp": 1.07269001, + "diversity_loss_mlp": 0.0, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06053914015656951, + "language_loss": 0.88364595, + "learning_rate": 0.0005323777293394854, + "loss": 0.89448464, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2573, + "time_per_iteration": 2.539825201034546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.07365584, + "diversity_loss_mlp": 0.0, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06797932871808014, + "language_loss": 0.81904709, + "learning_rate": 0.000532066834641184, + "loss": 0.8298943, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.11065674, + "routerloss_mlp": 0.0, + "step": 2574, + "time_per_iteration": 2.6663713455200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103824, + "balance_loss_mlp": 1.09271336, + "diversity_loss_mlp": 0.0, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07191084425213706, + "language_loss": 0.85331243, + "learning_rate": 0.0005317559274934334, + "loss": 0.86435068, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.11114502, + "routerloss_mlp": 0.0, + "step": 2575, + "time_per_iteration": 2.756410598754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097116, + "balance_loss_mlp": 1.08592236, + "diversity_loss_mlp": 0.0, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.08893709148941176, + "language_loss": 0.80365205, + "learning_rate": 0.0005314450080169382, + "loss": 0.81462318, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.11199951, + "routerloss_mlp": 0.0, + "step": 2576, + "time_per_iteration": 2.613163471221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092174, + "balance_loss_mlp": 1.0810523, + "diversity_loss_mlp": 0.0, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.10818754121519983, + "language_loss": 0.8082127, + "learning_rate": 0.0005311340763324083, + "loss": 0.81913447, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2577, + "time_per_iteration": 2.5670807361602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087439, + "balance_loss_mlp": 1.07612574, + "diversity_loss_mlp": 0.0, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.07097138632102568, + "language_loss": 0.82323599, + "learning_rate": 0.0005308231325605578, + "loss": 0.83411032, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.11315918, + "routerloss_mlp": 0.0, + "step": 2578, + "time_per_iteration": 2.6519079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085757, + "balance_loss_mlp": 1.07421172, + "diversity_loss_mlp": 0.0, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06601832089031445, + "language_loss": 0.76727217, + "learning_rate": 0.0005305121768221061, + "loss": 0.7781297, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.11535645, + "routerloss_mlp": 0.0, + "step": 2579, + "time_per_iteration": 3.1306209564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_mlp": 1.03489161, + "diversity_loss_mlp": 0.0, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.022004289450105873, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76079202, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 2580, + "time_per_iteration": 4.8141255378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079045, + "balance_loss_mlp": 1.06767821, + "diversity_loss_mlp": 0.0, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.06618835036619775, + "language_loss": 0.91614985, + "learning_rate": 0.0005298902299282984, + "loss": 0.92694032, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2581, + "time_per_iteration": 2.586012125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087579, + "balance_loss_mlp": 1.07617044, + "diversity_loss_mlp": 0.0, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.07143589820149647, + "language_loss": 0.84265745, + "learning_rate": 0.0005295792390144033, + "loss": 0.85353327, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2582, + "time_per_iteration": 2.704911708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096311, + "balance_loss_mlp": 1.08442605, + "diversity_loss_mlp": 0.0, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.07556433689349051, + "language_loss": 0.83576399, + "learning_rate": 0.0005292682366168294, + "loss": 0.84672707, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2583, + "time_per_iteration": 2.5530638694763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105009, + "balance_loss_mlp": 1.09309435, + "diversity_loss_mlp": 0.0, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.06699014279274042, + "language_loss": 0.80089158, + "learning_rate": 0.0005289572228563181, + "loss": 0.81194162, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.11914062, + "routerloss_mlp": 0.0, + "step": 2584, + "time_per_iteration": 2.729093551635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100707, + "balance_loss_mlp": 1.08861935, + "diversity_loss_mlp": 0.0, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 0.0657007833960997, + "language_loss": 0.83234823, + "learning_rate": 0.000528646197853616, + "loss": 0.8433553, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.12078857, + "routerloss_mlp": 0.0, + "step": 2585, + "time_per_iteration": 2.727252721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113697, + "balance_loss_mlp": 1.10166335, + "diversity_loss_mlp": 0.0, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.07376563164337009, + "language_loss": 0.85810697, + "learning_rate": 0.0005283351617294735, + "loss": 0.86924398, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.12023926, + "routerloss_mlp": 0.0, + "step": 2586, + "time_per_iteration": 2.945610761642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011716, + "balance_loss_mlp": 1.00470638, + "diversity_loss_mlp": 0.0, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.017193207514109847, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77648377, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.0703125, + "routerloss_mlp": 0.0, + "step": 2587, + "time_per_iteration": 5.038366079330444 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.07597303, + "diversity_loss_mlp": 0.0, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06591325697086226, + "language_loss": 0.86769819, + "learning_rate": 0.0005277130565998916, + "loss": 0.87858337, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.12554932, + "routerloss_mlp": 0.0, + "step": 2588, + "time_per_iteration": 2.7726681232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086804, + "balance_loss_mlp": 1.07443595, + "diversity_loss_mlp": 0.0, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.05822748641904789, + "language_loss": 0.81899714, + "learning_rate": 0.0005274019878359748, + "loss": 0.82986516, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.12371826, + "routerloss_mlp": 0.0, + "step": 2589, + "time_per_iteration": 2.733985424041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075103, + "balance_loss_mlp": 1.06275249, + "diversity_loss_mlp": 0.0, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.0736619230298454, + "language_loss": 0.87174684, + "learning_rate": 0.0005270909084336628, + "loss": 0.88249791, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.12335205, + "routerloss_mlp": 0.0, + "step": 2590, + "time_per_iteration": 2.648728370666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075145, + "balance_loss_mlp": 1.06231809, + "diversity_loss_mlp": 0.0, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.07329601175103365, + "language_loss": 0.8877548, + "learning_rate": 0.0005267798185137276, + "loss": 0.89850616, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.12835693, + "routerloss_mlp": 0.0, + "step": 2591, + "time_per_iteration": 2.616903066635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061242, + "balance_loss_mlp": 1.04852843, + "diversity_loss_mlp": 0.0, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.0712913700859702, + "language_loss": 0.89140213, + "learning_rate": 0.0005264687181969444, + "loss": 0.90201461, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.12713623, + "routerloss_mlp": 0.0, + "step": 2592, + "time_per_iteration": 2.7121951580047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067017, + "balance_loss_mlp": 1.05430353, + "diversity_loss_mlp": 0.0, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07969645648170227, + "language_loss": 0.75208342, + "learning_rate": 0.0005261576076040937, + "loss": 0.76275361, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.12719727, + "routerloss_mlp": 0.0, + "step": 2593, + "time_per_iteration": 3.248811721801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059604, + "balance_loss_mlp": 1.04746807, + "diversity_loss_mlp": 0.0, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07355463018535204, + "language_loss": 0.84396625, + "learning_rate": 0.0005258464868559591, + "loss": 0.85456228, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.12121582, + "routerloss_mlp": 0.0, + "step": 2594, + "time_per_iteration": 2.6535778045654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_mlp": 1.0461601, + "diversity_loss_mlp": 0.0, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06735340586139127, + "language_loss": 0.88490266, + "learning_rate": 0.0005255353560733284, + "loss": 0.89548326, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.11907959, + "routerloss_mlp": 0.0, + "step": 2595, + "time_per_iteration": 2.5711045265197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_mlp": 1.03453541, + "diversity_loss_mlp": 0.0, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.025598241729826776, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76619136, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 2596, + "time_per_iteration": 4.7992448806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_mlp": 1.05498767, + "diversity_loss_mlp": 0.0, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.07107233717475309, + "language_loss": 0.83179224, + "learning_rate": 0.0005249130648877492, + "loss": 0.84246206, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.11987305, + "routerloss_mlp": 0.0, + "step": 2597, + "time_per_iteration": 2.7089900970458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068426, + "balance_loss_mlp": 1.05646324, + "diversity_loss_mlp": 0.0, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.08792128719199578, + "language_loss": 0.84945238, + "learning_rate": 0.0005246019047263953, + "loss": 0.86013663, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.11962891, + "routerloss_mlp": 0.0, + "step": 2598, + "time_per_iteration": 2.4586942195892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070932, + "balance_loss_mlp": 1.0594883, + "diversity_loss_mlp": 0.0, + "epoch": 0.5, + "flos": 467350373376.0, + "grad_norm": 0.08031275074858332, + "language_loss": 0.82562858, + "learning_rate": 0.0005242907350137353, + "loss": 0.83633792, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2599, + "time_per_iteration": 2.547146797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075807, + "balance_loss_mlp": 1.06445217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.08690624784708721, + "language_loss": 0.79332286, + "learning_rate": 0.0005239795558705754, + "loss": 0.80408096, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2600, + "time_per_iteration": 2.5985541343688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077027, + "balance_loss_mlp": 1.06555915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.06025548364908716, + "language_loss": 0.89517641, + "learning_rate": 0.0005236683674177264, + "loss": 0.90594667, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2601, + "time_per_iteration": 2.6358349323272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090245, + "balance_loss_mlp": 1.07874131, + "diversity_loss_mlp": 0.0, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06252214062087984, + "language_loss": 0.82497251, + "learning_rate": 0.0005233571697760021, + "loss": 0.83587497, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.11505127, + "routerloss_mlp": 0.0, + "step": 2602, + "time_per_iteration": 2.8629817962646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112229, + "balance_loss_mlp": 1.10087442, + "diversity_loss_mlp": 0.0, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.06974132169475507, + "language_loss": 0.8293485, + "learning_rate": 0.0005230459630662203, + "loss": 0.84047079, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.11352539, + "routerloss_mlp": 0.0, + "step": 2603, + "time_per_iteration": 2.939380168914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114631, + "balance_loss_mlp": 1.10359812, + "diversity_loss_mlp": 0.0, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.10511771954620508, + "language_loss": 0.81605637, + "learning_rate": 0.0005227347474092022, + "loss": 0.82720268, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2604, + "time_per_iteration": 2.7169747352600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112322, + "balance_loss_mlp": 1.11197877, + "diversity_loss_mlp": 0.0, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.07495893748856379, + "language_loss": 0.83243322, + "learning_rate": 0.0005224235229257724, + "loss": 0.84366548, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2605, + "time_per_iteration": 2.6940438747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113092, + "balance_loss_mlp": 1.10178471, + "diversity_loss_mlp": 0.0, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.06884013858989874, + "language_loss": 0.86851203, + "learning_rate": 0.0005221122897367589, + "loss": 0.87964296, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.11309814, + "routerloss_mlp": 0.0, + "step": 2606, + "time_per_iteration": 2.800685405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109504, + "balance_loss_mlp": 1.09854841, + "diversity_loss_mlp": 0.0, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08142217271827161, + "language_loss": 0.81335354, + "learning_rate": 0.0005218010479629932, + "loss": 0.82444859, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.10961914, + "routerloss_mlp": 0.0, + "step": 2607, + "time_per_iteration": 2.657087564468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098904, + "balance_loss_mlp": 1.08753133, + "diversity_loss_mlp": 0.0, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.08269023882009051, + "language_loss": 0.82140303, + "learning_rate": 0.0005214897977253102, + "loss": 0.83239204, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2608, + "time_per_iteration": 2.649846076965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084998, + "balance_loss_mlp": 1.07372093, + "diversity_loss_mlp": 0.0, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.061165709745894754, + "language_loss": 0.84233439, + "learning_rate": 0.0005211785391445473, + "loss": 0.8531844, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2609, + "time_per_iteration": 2.7179222106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087043, + "balance_loss_mlp": 1.07538986, + "diversity_loss_mlp": 0.0, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.06641391212047838, + "language_loss": 0.79080439, + "learning_rate": 0.0005208672723415467, + "loss": 0.80167478, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2610, + "time_per_iteration": 2.7928884029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.07359457, + "diversity_loss_mlp": 0.0, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.07063839016412009, + "language_loss": 0.79436052, + "learning_rate": 0.0005205559974371525, + "loss": 0.80521345, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2611, + "time_per_iteration": 2.75744366645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085649, + "balance_loss_mlp": 1.07412767, + "diversity_loss_mlp": 0.0, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.06307258943078059, + "language_loss": 0.82345438, + "learning_rate": 0.0005202447145522123, + "loss": 0.83431089, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.1151123, + "routerloss_mlp": 0.0, + "step": 2612, + "time_per_iteration": 2.6847879886627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084149, + "balance_loss_mlp": 1.07245421, + "diversity_loss_mlp": 0.0, + "epoch": 0.5026933435936899, + "flos": 455139606528.0, + "grad_norm": 0.060686478103186246, + "language_loss": 0.79358983, + "learning_rate": 0.0005199334238075769, + "loss": 0.80443138, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.11682129, + "routerloss_mlp": 0.0, + "step": 2613, + "time_per_iteration": 2.560041666030884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084812, + "balance_loss_mlp": 1.07277226, + "diversity_loss_mlp": 0.0, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.086387426867178, + "language_loss": 0.91963339, + "learning_rate": 0.0005196221253241, + "loss": 0.93048155, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.12030029, + "routerloss_mlp": 0.0, + "step": 2614, + "time_per_iteration": 2.6397578716278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107839, + "balance_loss_mlp": 1.06617713, + "diversity_loss_mlp": 0.0, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.09198716130289855, + "language_loss": 0.82890773, + "learning_rate": 0.0005193108192226383, + "loss": 0.83969164, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2615, + "time_per_iteration": 2.7370193004608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076176, + "balance_loss_mlp": 1.06396329, + "diversity_loss_mlp": 0.0, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.08941342921082604, + "language_loss": 0.86907744, + "learning_rate": 0.000518999505624052, + "loss": 0.87983918, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.12213135, + "routerloss_mlp": 0.0, + "step": 2616, + "time_per_iteration": 2.733515739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067104, + "balance_loss_mlp": 1.05521274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05504525356098391, + "language_loss": 0.83447164, + "learning_rate": 0.000518688184649203, + "loss": 0.84514272, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.11883545, + "routerloss_mlp": 0.0, + "step": 2617, + "time_per_iteration": 2.816542625427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075166, + "balance_loss_mlp": 1.06264269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07489503160460931, + "language_loss": 0.83596766, + "learning_rate": 0.0005183768564189577, + "loss": 0.84671938, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.12524414, + "routerloss_mlp": 0.0, + "step": 2618, + "time_per_iteration": 2.5781893730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081949, + "balance_loss_mlp": 1.07029045, + "diversity_loss_mlp": 0.0, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.0695581827230682, + "language_loss": 0.81485611, + "learning_rate": 0.0005180655210541838, + "loss": 0.82567555, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.11651611, + "routerloss_mlp": 0.0, + "step": 2619, + "time_per_iteration": 2.5642077922821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091231, + "balance_loss_mlp": 1.07894695, + "diversity_loss_mlp": 0.0, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.08072673001204132, + "language_loss": 0.83226323, + "learning_rate": 0.0005177541786757527, + "loss": 0.84317553, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.1227417, + "routerloss_mlp": 0.0, + "step": 2620, + "time_per_iteration": 2.7365450859069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100722, + "balance_loss_mlp": 1.0882231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.0921594393427519, + "language_loss": 0.82626402, + "learning_rate": 0.000517442829404538, + "loss": 0.83727121, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2621, + "time_per_iteration": 3.053333044052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097629, + "balance_loss_mlp": 1.08534431, + "diversity_loss_mlp": 0.0, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.0844592365120011, + "language_loss": 0.87026393, + "learning_rate": 0.0005171314733614166, + "loss": 0.88124025, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.12286377, + "routerloss_mlp": 0.0, + "step": 2622, + "time_per_iteration": 2.8867554664611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099715, + "balance_loss_mlp": 1.08721614, + "diversity_loss_mlp": 0.0, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.07191738026805333, + "language_loss": 0.78457403, + "learning_rate": 0.0005168201106672671, + "loss": 0.79557121, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.125, + "routerloss_mlp": 0.0, + "step": 2623, + "time_per_iteration": 2.7532849311828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083535, + "balance_loss_mlp": 1.07122076, + "diversity_loss_mlp": 0.0, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.06664161086213699, + "language_loss": 0.84876573, + "learning_rate": 0.0005165087414429717, + "loss": 0.85960108, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.12316895, + "routerloss_mlp": 0.0, + "step": 2624, + "time_per_iteration": 2.614475965499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.061566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.06712294156504883, + "language_loss": 0.83509946, + "learning_rate": 0.0005161973658094144, + "loss": 0.84583604, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.12072754, + "routerloss_mlp": 0.0, + "step": 2625, + "time_per_iteration": 2.6536033153533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00875819, + "balance_loss_mlp": 1.51064336, + "diversity_loss_mlp": 0.21324398, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.02954045761884847, + "language_loss": 0.82599998, + "learning_rate": 0.000515885983887482, + "loss": 0.83475816, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01387555, + "step": 2626, + "time_per_iteration": 2.801612138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070563, + "balance_loss_mlp": 1.05863595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5053866871873798, + "flos": 496686463488.0, + "grad_norm": 0.07357396162877478, + "language_loss": 0.84283531, + "learning_rate": 0.0005155745957980636, + "loss": 0.8535409, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.11920166, + "routerloss_mlp": 0.0, + "step": 2627, + "time_per_iteration": 2.6239585876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071483, + "balance_loss_mlp": 1.0589962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.06901961430938243, + "language_loss": 0.88532668, + "learning_rate": 0.000515263201662051, + "loss": 0.89604151, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.12493896, + "routerloss_mlp": 0.0, + "step": 2628, + "time_per_iteration": 2.65803861618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107038, + "balance_loss_mlp": 1.05840504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.06314416177701848, + "language_loss": 0.8250618, + "learning_rate": 0.0005149518016003378, + "loss": 0.8357656, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.11968994, + "routerloss_mlp": 0.0, + "step": 2629, + "time_per_iteration": 3.1646623611450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061865, + "balance_loss_mlp": 1.04946709, + "diversity_loss_mlp": 0.0, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.1007750022567515, + "language_loss": 0.82337832, + "learning_rate": 0.0005146403957338206, + "loss": 0.83399695, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.12402344, + "routerloss_mlp": 0.0, + "step": 2630, + "time_per_iteration": 2.5879476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_mlp": 1.05236936, + "diversity_loss_mlp": 0.0, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06667308072604639, + "language_loss": 0.82288837, + "learning_rate": 0.0005143289841833975, + "loss": 0.83353263, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.12060547, + "routerloss_mlp": 0.0, + "step": 2631, + "time_per_iteration": 2.8448615074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.05643749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.09203997555384738, + "language_loss": 0.82179189, + "learning_rate": 0.0005140175670699696, + "loss": 0.83247638, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.11999512, + "routerloss_mlp": 0.0, + "step": 2632, + "time_per_iteration": 2.642666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067258, + "balance_loss_mlp": 1.05545044, + "diversity_loss_mlp": 0.0, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.04894531982576629, + "language_loss": 0.82796603, + "learning_rate": 0.0005137061445144395, + "loss": 0.8386386, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.11804199, + "routerloss_mlp": 0.0, + "step": 2633, + "time_per_iteration": 2.8800737857818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076133, + "balance_loss_mlp": 1.06476033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.06583044180155191, + "language_loss": 0.87074906, + "learning_rate": 0.000513394716637712, + "loss": 0.88151038, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.1137085, + "routerloss_mlp": 0.0, + "step": 2634, + "time_per_iteration": 2.7507505416870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_mlp": 1.02921486, + "diversity_loss_mlp": 0.0, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.03533282921310782, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80227697, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.06787109, + "routerloss_mlp": 0.0, + "step": 2635, + "time_per_iteration": 4.825605869293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110028, + "balance_loss_mlp": 1.08881176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.07735545811428028, + "language_loss": 0.81068468, + "learning_rate": 0.0005127718454042958, + "loss": 0.82168746, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.11462402, + "routerloss_mlp": 0.0, + "step": 2636, + "time_per_iteration": 2.8241050243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099835, + "balance_loss_mlp": 1.08840299, + "diversity_loss_mlp": 0.0, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.08187506034762644, + "language_loss": 0.83836603, + "learning_rate": 0.0005124604022894269, + "loss": 0.8493644, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.11425781, + "routerloss_mlp": 0.0, + "step": 2637, + "time_per_iteration": 2.9366774559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019034, + "balance_loss_mlp": 1.01259708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.025963071476552062, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.7820726, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 2638, + "time_per_iteration": 4.828620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092892, + "balance_loss_mlp": 1.08166814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.07837351333742608, + "language_loss": 0.83244252, + "learning_rate": 0.0005118375016679325, + "loss": 0.84337139, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.11224365, + "routerloss_mlp": 0.0, + "step": 2639, + "time_per_iteration": 2.801852226257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077953, + "balance_loss_mlp": 1.0666697, + "diversity_loss_mlp": 0.0, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.07879033409242599, + "language_loss": 0.80358827, + "learning_rate": 0.0005115260444031382, + "loss": 0.81436777, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.11279297, + "routerloss_mlp": 0.0, + "step": 2640, + "time_per_iteration": 2.596771240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010253, + "balance_loss_mlp": 1.00422084, + "diversity_loss_mlp": 0.0, + "epoch": 0.5080800307810697, + "flos": 1584224428032.0, + "grad_norm": 0.011737851482073082, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79742074, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.06030273, + "routerloss_mlp": 0.0, + "step": 2641, + "time_per_iteration": 4.948842287063599 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075403, + "balance_loss_mlp": 1.06412029, + "diversity_loss_mlp": 0.0, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.08031663653823312, + "language_loss": 0.8740893, + "learning_rate": 0.0005109031165700483, + "loss": 0.88484335, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.112854, + "routerloss_mlp": 0.0, + "step": 2642, + "time_per_iteration": 2.5833895206451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_mlp": 1.04938459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.06372027514248847, + "language_loss": 0.83170295, + "learning_rate": 0.0005105916462435945, + "loss": 0.84231174, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2643, + "time_per_iteration": 2.841296911239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106144, + "balance_loss_mlp": 1.05014455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.0681709540800111, + "language_loss": 0.85266602, + "learning_rate": 0.0005102801718050989, + "loss": 0.86328042, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2644, + "time_per_iteration": 2.680905818939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058539, + "balance_loss_mlp": 1.04714894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.07434027721258654, + "language_loss": 0.89314902, + "learning_rate": 0.0005099686933754867, + "loss": 0.90373439, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.1138916, + "routerloss_mlp": 0.0, + "step": 2645, + "time_per_iteration": 2.723043441772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062253, + "balance_loss_mlp": 1.05088663, + "diversity_loss_mlp": 0.0, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07256046334666034, + "language_loss": 0.8429243, + "learning_rate": 0.0005096572110756845, + "loss": 0.85354686, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2646, + "time_per_iteration": 2.6682143211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069615, + "balance_loss_mlp": 1.05801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06200075514200526, + "language_loss": 0.85445803, + "learning_rate": 0.0005093457250266205, + "loss": 0.86515421, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.11584473, + "routerloss_mlp": 0.0, + "step": 2647, + "time_per_iteration": 2.682891368865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069794, + "balance_loss_mlp": 1.05816472, + "diversity_loss_mlp": 0.0, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1092618136395953, + "language_loss": 0.83279526, + "learning_rate": 0.000509034235349224, + "loss": 0.84349322, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.11627197, + "routerloss_mlp": 0.0, + "step": 2648, + "time_per_iteration": 2.7173004150390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068823, + "balance_loss_mlp": 1.05756938, + "diversity_loss_mlp": 0.0, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07759183255272654, + "language_loss": 0.81290972, + "learning_rate": 0.0005087227421644266, + "loss": 0.82359791, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.11248779, + "routerloss_mlp": 0.0, + "step": 2649, + "time_per_iteration": 2.79217791557312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_mlp": 1.05469334, + "diversity_loss_mlp": 0.0, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.07036579944312285, + "language_loss": 0.85978615, + "learning_rate": 0.0005084112455931602, + "loss": 0.87045121, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.11798096, + "routerloss_mlp": 0.0, + "step": 2650, + "time_per_iteration": 2.593323230743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.06125915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.06673546987966349, + "language_loss": 0.85377133, + "learning_rate": 0.0005080997457563586, + "loss": 0.86449993, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2651, + "time_per_iteration": 2.5473101139068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074592, + "balance_loss_mlp": 1.06324303, + "diversity_loss_mlp": 0.0, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.07839929831674766, + "language_loss": 0.79146206, + "learning_rate": 0.0005077882427749569, + "loss": 0.80220807, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.11340332, + "routerloss_mlp": 0.0, + "step": 2652, + "time_per_iteration": 2.5378577709198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081444, + "balance_loss_mlp": 1.07002354, + "diversity_loss_mlp": 0.0, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.09222135648623411, + "language_loss": 0.84599656, + "learning_rate": 0.0005074767367698913, + "loss": 0.85681099, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.11407471, + "routerloss_mlp": 0.0, + "step": 2653, + "time_per_iteration": 2.7541823387145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.0749042, + "diversity_loss_mlp": 0.0, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.07250262260433718, + "language_loss": 0.82987714, + "learning_rate": 0.0005071652278620988, + "loss": 0.84074312, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.11688232, + "routerloss_mlp": 0.0, + "step": 2654, + "time_per_iteration": 3.0615251064300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089781, + "balance_loss_mlp": 1.07870018, + "diversity_loss_mlp": 0.0, + "epoch": 0.5107733743747596, + "flos": 658624131072.0, + "grad_norm": 0.07582936293709001, + "language_loss": 0.83328903, + "learning_rate": 0.0005068537161725186, + "loss": 0.84418684, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.11083984, + "routerloss_mlp": 0.0, + "step": 2655, + "time_per_iteration": 2.7840993404388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092544, + "balance_loss_mlp": 1.08139753, + "diversity_loss_mlp": 0.0, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07786356346883126, + "language_loss": 0.84288549, + "learning_rate": 0.0005065422018220893, + "loss": 0.85381097, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.1114502, + "routerloss_mlp": 0.0, + "step": 2656, + "time_per_iteration": 2.832575798034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102102, + "balance_loss_mlp": 1.09118247, + "diversity_loss_mlp": 0.0, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.08194812181942494, + "language_loss": 0.80392313, + "learning_rate": 0.0005062306849317521, + "loss": 0.81494415, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2657, + "time_per_iteration": 2.794966220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.08891487, + "diversity_loss_mlp": 0.0, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.08210850574888065, + "language_loss": 0.83486134, + "learning_rate": 0.0005059191656224487, + "loss": 0.84586298, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2658, + "time_per_iteration": 2.744889736175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093286, + "balance_loss_mlp": 1.08238411, + "diversity_loss_mlp": 0.0, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.07321009008554179, + "language_loss": 0.88860798, + "learning_rate": 0.0005056076440151212, + "loss": 0.89954078, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2659, + "time_per_iteration": 2.6951825618743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113007, + "balance_loss_mlp": 1.12453902, + "diversity_loss_mlp": 0.0, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.07076104465295206, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77418184, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2660, + "time_per_iteration": 4.850585460662842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.07051301, + "diversity_loss_mlp": 0.0, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06225287802871053, + "language_loss": 0.86966121, + "learning_rate": 0.0005049845943901691, + "loss": 0.88047487, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2661, + "time_per_iteration": 2.8342370986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079888, + "balance_loss_mlp": 1.0692786, + "diversity_loss_mlp": 0.0, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.058043198592839004, + "language_loss": 0.86637139, + "learning_rate": 0.0005046730666144338, + "loss": 0.87717032, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2662, + "time_per_iteration": 2.8066177368164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078601, + "balance_loss_mlp": 1.06801558, + "diversity_loss_mlp": 0.0, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.058701328600128284, + "language_loss": 0.87834954, + "learning_rate": 0.0005043615370244532, + "loss": 0.88913548, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2663, + "time_per_iteration": 3.3716113567352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_mlp": 1.04589903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.02890820887526385, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79295814, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2664, + "time_per_iteration": 4.632098913192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074303, + "balance_loss_mlp": 1.0636878, + "diversity_loss_mlp": 0.0, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.05776678043634197, + "language_loss": 0.85301316, + "learning_rate": 0.0005037384728855425, + "loss": 0.86375624, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2665, + "time_per_iteration": 2.8025074005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077204, + "balance_loss_mlp": 1.06618285, + "diversity_loss_mlp": 0.0, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.08001364709617295, + "language_loss": 0.84092522, + "learning_rate": 0.0005034269385785075, + "loss": 0.85169727, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2666, + "time_per_iteration": 2.6508989334106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070249, + "balance_loss_mlp": 1.05929327, + "diversity_loss_mlp": 0.0, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.06550806602425656, + "language_loss": 0.849998, + "learning_rate": 0.0005031154029410168, + "loss": 0.86070049, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2667, + "time_per_iteration": 2.6072959899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062599, + "balance_loss_mlp": 1.05130351, + "diversity_loss_mlp": 0.0, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07261202613887993, + "language_loss": 0.86903906, + "learning_rate": 0.0005028038660940197, + "loss": 0.87966514, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2668, + "time_per_iteration": 2.5607664585113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_mlp": 1.04923522, + "diversity_loss_mlp": 0.0, + "epoch": 0.5134667179684494, + "flos": 503827029504.0, + "grad_norm": 0.06521290367629204, + "language_loss": 0.84553415, + "learning_rate": 0.0005024923281584648, + "loss": 0.8561402, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.11376953, + "routerloss_mlp": 0.0, + "step": 2669, + "time_per_iteration": 2.623643159866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066692, + "balance_loss_mlp": 1.05528402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.06549707374857121, + "language_loss": 0.82560658, + "learning_rate": 0.0005021807892553026, + "loss": 0.83627355, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.11413574, + "routerloss_mlp": 0.0, + "step": 2670, + "time_per_iteration": 2.699392318725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062757, + "balance_loss_mlp": 1.05140269, + "diversity_loss_mlp": 0.0, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07318428846825417, + "language_loss": 0.84862608, + "learning_rate": 0.0005018692495054828, + "loss": 0.85925364, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2671, + "time_per_iteration": 2.7645046710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106912, + "balance_loss_mlp": 1.05812323, + "diversity_loss_mlp": 0.0, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06397327244364565, + "language_loss": 0.80696338, + "learning_rate": 0.0005015577090299561, + "loss": 0.81765461, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.11004639, + "routerloss_mlp": 0.0, + "step": 2672, + "time_per_iteration": 2.684048891067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068328, + "balance_loss_mlp": 1.05731261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.06574977800170037, + "language_loss": 0.86744952, + "learning_rate": 0.0005012461679496729, + "loss": 0.87813282, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.11022949, + "routerloss_mlp": 0.0, + "step": 2673, + "time_per_iteration": 2.5885825157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.06613708, + "diversity_loss_mlp": 0.0, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.09032594792095527, + "language_loss": 0.87748468, + "learning_rate": 0.0005009346263855848, + "loss": 0.88825834, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.11236572, + "routerloss_mlp": 0.0, + "step": 2674, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092275, + "balance_loss_mlp": 1.08141518, + "diversity_loss_mlp": 0.0, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.06465969942237398, + "language_loss": 0.83699256, + "learning_rate": 0.0005006230844586422, + "loss": 0.84791529, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2675, + "time_per_iteration": 2.7912445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00882234, + "balance_loss_mlp": 1.52600026, + "diversity_loss_mlp": 0.21199086, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.0263651655655577, + "language_loss": 0.78895926, + "learning_rate": 0.0005003115422897968, + "loss": 0.79778159, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01323896, + "step": 2676, + "time_per_iteration": 2.8051552772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111408, + "balance_loss_mlp": 1.10282683, + "diversity_loss_mlp": 0.0, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.0741463219638638, + "language_loss": 0.87253916, + "learning_rate": 0.0005, + "loss": 0.88367999, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.11254883, + "routerloss_mlp": 0.0, + "step": 2677, + "time_per_iteration": 2.6435391902923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119404, + "balance_loss_mlp": 1.10841274, + "diversity_loss_mlp": 0.0, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.08792863943872284, + "language_loss": 0.79283178, + "learning_rate": 0.0004996884577102033, + "loss": 0.80402583, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.10992432, + "routerloss_mlp": 0.0, + "step": 2678, + "time_per_iteration": 3.089707374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111646, + "balance_loss_mlp": 1.10545659, + "diversity_loss_mlp": 0.0, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.08112886088857633, + "language_loss": 0.84611261, + "learning_rate": 0.000499376915541358, + "loss": 0.85727721, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 2679, + "time_per_iteration": 2.7143540382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109969, + "balance_loss_mlp": 1.08910465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.16255458440641746, + "language_loss": 0.81113428, + "learning_rate": 0.0004990653736144155, + "loss": 0.82213122, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 2680, + "time_per_iteration": 2.857952356338501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084916, + "balance_loss_mlp": 1.07416916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06912387000686389, + "language_loss": 0.85820174, + "learning_rate": 0.0004987538320503271, + "loss": 0.86905092, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.10748291, + "routerloss_mlp": 0.0, + "step": 2681, + "time_per_iteration": 2.485462188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077984, + "balance_loss_mlp": 1.06715369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08121908376237164, + "language_loss": 0.83137929, + "learning_rate": 0.0004984422909700442, + "loss": 0.84215909, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2682, + "time_per_iteration": 2.7179505825042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068711, + "balance_loss_mlp": 1.05784559, + "diversity_loss_mlp": 0.0, + "epoch": 0.5161600615621393, + "flos": 586510709760.0, + "grad_norm": 0.07829442771548371, + "language_loss": 0.83800036, + "learning_rate": 0.0004981307504945173, + "loss": 0.84868753, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2683, + "time_per_iteration": 2.71893048286438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061815, + "balance_loss_mlp": 1.05075228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.08619577510477876, + "language_loss": 0.89448887, + "learning_rate": 0.0004978192107446976, + "loss": 0.90510702, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2684, + "time_per_iteration": 2.7385506629943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062179, + "balance_loss_mlp": 1.05111599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.08129158019501125, + "language_loss": 0.8740204, + "learning_rate": 0.0004975076718415353, + "loss": 0.88464212, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2685, + "time_per_iteration": 2.599379777908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_mlp": 1.04478931, + "diversity_loss_mlp": 0.0, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.06772474949474022, + "language_loss": 0.90610582, + "learning_rate": 0.0004971961339059806, + "loss": 0.91666389, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.11016846, + "routerloss_mlp": 0.0, + "step": 2686, + "time_per_iteration": 2.498819589614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057473, + "balance_loss_mlp": 1.04611838, + "diversity_loss_mlp": 0.0, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.06487308694775892, + "language_loss": 0.84021914, + "learning_rate": 0.0004968845970589832, + "loss": 0.85079384, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2687, + "time_per_iteration": 2.6814825534820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061325, + "balance_loss_mlp": 1.04982185, + "diversity_loss_mlp": 0.0, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.06911328459433905, + "language_loss": 0.8435297, + "learning_rate": 0.0004965730614214926, + "loss": 0.8541429, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.11499023, + "routerloss_mlp": 0.0, + "step": 2688, + "time_per_iteration": 2.6537294387817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106762, + "balance_loss_mlp": 1.05618167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07039148040030412, + "language_loss": 0.85285878, + "learning_rate": 0.0004962615271144576, + "loss": 0.86353499, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.11431885, + "routerloss_mlp": 0.0, + "step": 2689, + "time_per_iteration": 2.50710129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064714, + "balance_loss_mlp": 1.05325246, + "diversity_loss_mlp": 0.0, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.0770213433091723, + "language_loss": 0.82680881, + "learning_rate": 0.0004959499942588264, + "loss": 0.83745599, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.11456299, + "routerloss_mlp": 0.0, + "step": 2690, + "time_per_iteration": 2.892293930053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_mlp": 1.04297149, + "diversity_loss_mlp": 0.0, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03551055813206397, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79249913, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.06933594, + "routerloss_mlp": 0.0, + "step": 2691, + "time_per_iteration": 4.764665842056274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070219, + "balance_loss_mlp": 1.05894208, + "diversity_loss_mlp": 0.0, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.08037192658361764, + "language_loss": 0.85416174, + "learning_rate": 0.0004953269333855661, + "loss": 0.86486399, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.11273193, + "routerloss_mlp": 0.0, + "step": 2692, + "time_per_iteration": 2.785511016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06407034, + "diversity_loss_mlp": 0.0, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.06114385406953633, + "language_loss": 0.84516799, + "learning_rate": 0.0004950154056098309, + "loss": 0.85592318, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.11437988, + "routerloss_mlp": 0.0, + "step": 2693, + "time_per_iteration": 2.683246374130249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.07183599, + "diversity_loss_mlp": 0.0, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.08066804074186672, + "language_loss": 0.84078431, + "learning_rate": 0.0004947038797692867, + "loss": 0.85161769, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1149292, + "routerloss_mlp": 0.0, + "step": 2694, + "time_per_iteration": 2.8312196731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00872465, + "balance_loss_mlp": 1.50766385, + "diversity_loss_mlp": 0.2097543, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.031552182630998016, + "language_loss": 0.77636528, + "learning_rate": 0.0004943923559848789, + "loss": 0.78508997, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01375636, + "step": 2695, + "time_per_iteration": 2.8084189891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010865, + "balance_loss_mlp": 1.07534158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.055486891719670514, + "language_loss": 0.90695632, + "learning_rate": 0.0004940808343775515, + "loss": 0.91782129, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2696, + "time_per_iteration": 2.6868011951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874209, + "balance_loss_mlp": 1.50797677, + "diversity_loss_mlp": 0.21290711, + "epoch": 0.5188534051558291, + "flos": 428879702016.0, + "grad_norm": 0.034010170020107075, + "language_loss": 0.82213199, + "learning_rate": 0.0004937693150682479, + "loss": 0.83087409, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01376703, + "step": 2697, + "time_per_iteration": 2.5905513763427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090314, + "balance_loss_mlp": 1.07915568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.06705206433038317, + "language_loss": 0.7658723, + "learning_rate": 0.0004934577981779107, + "loss": 0.77677542, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.1116333, + "routerloss_mlp": 0.0, + "step": 2698, + "time_per_iteration": 2.7049057483673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087095, + "balance_loss_mlp": 1.07585335, + "diversity_loss_mlp": 0.0, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.061529133753451364, + "language_loss": 0.812904, + "learning_rate": 0.0004931462838274817, + "loss": 0.82377493, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.11242676, + "routerloss_mlp": 0.0, + "step": 2699, + "time_per_iteration": 2.8723175525665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089813, + "balance_loss_mlp": 1.07877994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.08487292742433496, + "language_loss": 0.84222901, + "learning_rate": 0.0004928347721379011, + "loss": 0.85312712, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2700, + "time_per_iteration": 2.639867067337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080545, + "balance_loss_mlp": 1.06974459, + "diversity_loss_mlp": 0.0, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06134037245316137, + "language_loss": 0.82221866, + "learning_rate": 0.0004925232632301089, + "loss": 0.83302414, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.10797119, + "routerloss_mlp": 0.0, + "step": 2701, + "time_per_iteration": 2.622311592102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077123, + "balance_loss_mlp": 1.0660243, + "diversity_loss_mlp": 0.0, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.06337758152829237, + "language_loss": 0.79842103, + "learning_rate": 0.0004922117572250431, + "loss": 0.80919224, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 2702, + "time_per_iteration": 2.6980605125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070723, + "balance_loss_mlp": 1.05936241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.07398400160993446, + "language_loss": 0.80852163, + "learning_rate": 0.0004919002542436414, + "loss": 0.81922889, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2703, + "time_per_iteration": 2.8354647159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072853, + "balance_loss_mlp": 1.0619514, + "diversity_loss_mlp": 0.0, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.064542502306726, + "language_loss": 0.8126899, + "learning_rate": 0.0004915887544068399, + "loss": 0.8234185, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.10906982, + "routerloss_mlp": 0.0, + "step": 2704, + "time_per_iteration": 2.6693973541259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.05770195, + "diversity_loss_mlp": 0.0, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.06578360362401801, + "language_loss": 0.7856639, + "learning_rate": 0.0004912772578355736, + "loss": 0.79635167, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2705, + "time_per_iteration": 2.892735481262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0611918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.07750798967783011, + "language_loss": 0.82549465, + "learning_rate": 0.000490965764650776, + "loss": 0.83621788, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.11126709, + "routerloss_mlp": 0.0, + "step": 2706, + "time_per_iteration": 2.8544106483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070979, + "balance_loss_mlp": 1.05984521, + "diversity_loss_mlp": 0.0, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.06572065456776559, + "language_loss": 0.82828736, + "learning_rate": 0.0004906542749733798, + "loss": 0.83899713, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.11132812, + "routerloss_mlp": 0.0, + "step": 2707, + "time_per_iteration": 3.6044294834136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107442, + "balance_loss_mlp": 1.06353068, + "diversity_loss_mlp": 0.0, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.055629683487612144, + "language_loss": 0.85401118, + "learning_rate": 0.0004903427889243156, + "loss": 0.86475539, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.10894775, + "routerloss_mlp": 0.0, + "step": 2708, + "time_per_iteration": 2.830115795135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075334, + "balance_loss_mlp": 1.06425905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.06692681375903406, + "language_loss": 0.85444081, + "learning_rate": 0.0004900313066245134, + "loss": 0.86519414, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2709, + "time_per_iteration": 2.6552441120147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106839, + "balance_loss_mlp": 1.05745232, + "diversity_loss_mlp": 0.0, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.06855502771674758, + "language_loss": 0.81061214, + "learning_rate": 0.0004897198281949012, + "loss": 0.82129598, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2710, + "time_per_iteration": 2.645981550216675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00874972, + "balance_loss_mlp": 1.51124442, + "diversity_loss_mlp": 0.21021394, + "epoch": 0.521546748749519, + "flos": 585959712768.0, + "grad_norm": 0.03577466895356274, + "language_loss": 0.78009295, + "learning_rate": 0.0004894083537564057, + "loss": 0.78884268, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01424256, + "step": 2711, + "time_per_iteration": 2.746945858001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0086804, + "balance_loss_mlp": 1.49602354, + "diversity_loss_mlp": 0.21089339, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.02967241377466632, + "language_loss": 0.80981171, + "learning_rate": 0.0004890968834299519, + "loss": 0.81849211, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01458106, + "step": 2712, + "time_per_iteration": 2.749049663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072348, + "balance_loss_mlp": 1.06096959, + "diversity_loss_mlp": 0.0, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06422523073894505, + "language_loss": 0.78739542, + "learning_rate": 0.0004887854173364633, + "loss": 0.79811883, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.11364746, + "routerloss_mlp": 0.0, + "step": 2713, + "time_per_iteration": 2.760077953338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00862336, + "balance_loss_mlp": 1.48416615, + "diversity_loss_mlp": 0.2112534, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.02839704110509781, + "language_loss": 0.81564224, + "learning_rate": 0.0004884739555968617, + "loss": 0.8242656, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01462588, + "step": 2714, + "time_per_iteration": 2.902200698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043267, + "balance_loss_mlp": 1.03711605, + "diversity_loss_mlp": 0.0, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.025188943281148922, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.8002032, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 2715, + "time_per_iteration": 4.977273464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847492, + "balance_loss_mlp": 1.45660305, + "diversity_loss_mlp": 0.21012819, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.03573397478438407, + "language_loss": 0.86888605, + "learning_rate": 0.0004878510456629992, + "loss": 0.87736094, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01412619, + "step": 2716, + "time_per_iteration": 2.998455286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068588, + "balance_loss_mlp": 1.05767989, + "diversity_loss_mlp": 0.0, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.06765059094142209, + "language_loss": 0.85142076, + "learning_rate": 0.00048753959771057314, + "loss": 0.86210662, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.10925293, + "routerloss_mlp": 0.0, + "step": 2717, + "time_per_iteration": 2.6113662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065726, + "balance_loss_mlp": 1.05442464, + "diversity_loss_mlp": 0.0, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.08600503840688169, + "language_loss": 0.82445514, + "learning_rate": 0.0004872281545957044, + "loss": 0.83511233, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.11297607, + "routerloss_mlp": 0.0, + "step": 2718, + "time_per_iteration": 2.7617604732513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05911732, + "diversity_loss_mlp": 0.0, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.061040572409093316, + "language_loss": 0.86051857, + "learning_rate": 0.0004869167164393055, + "loss": 0.87122279, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.11303711, + "routerloss_mlp": 0.0, + "step": 2719, + "time_per_iteration": 2.932154417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.05857992, + "diversity_loss_mlp": 0.0, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.11614833297327579, + "language_loss": 0.89542395, + "learning_rate": 0.00048660528336228793, + "loss": 0.90612125, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.11151123, + "routerloss_mlp": 0.0, + "step": 2720, + "time_per_iteration": 2.7917380332946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071611, + "balance_loss_mlp": 1.06013143, + "diversity_loss_mlp": 0.0, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.05730438157509479, + "language_loss": 0.90177751, + "learning_rate": 0.0004862938554855606, + "loss": 0.91249359, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.11474609, + "routerloss_mlp": 0.0, + "step": 2721, + "time_per_iteration": 2.809875965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074661, + "balance_loss_mlp": 1.06371188, + "diversity_loss_mlp": 0.0, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.06740042101514945, + "language_loss": 0.86071771, + "learning_rate": 0.0004859824329300304, + "loss": 0.87146431, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.10949707, + "routerloss_mlp": 0.0, + "step": 2722, + "time_per_iteration": 2.5660176277160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070887, + "balance_loss_mlp": 1.05932951, + "diversity_loss_mlp": 0.0, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.06312939516717878, + "language_loss": 0.83826602, + "learning_rate": 0.00048567101581660244, + "loss": 0.84897488, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.11560059, + "routerloss_mlp": 0.0, + "step": 2723, + "time_per_iteration": 2.593005895614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107168, + "balance_loss_mlp": 1.0603317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.07171512526566694, + "language_loss": 0.86622667, + "learning_rate": 0.00048535960426617956, + "loss": 0.87694347, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.11346436, + "routerloss_mlp": 0.0, + "step": 2724, + "time_per_iteration": 2.611551523208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070313, + "balance_loss_mlp": 1.05852962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5242400923432089, + "flos": 617939559936.0, + "grad_norm": 0.07077799246948024, + "language_loss": 0.81735158, + "learning_rate": 0.0004850481983996621, + "loss": 0.82805473, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.11767578, + "routerloss_mlp": 0.0, + "step": 2725, + "time_per_iteration": 2.7656939029693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058325, + "balance_loss_mlp": 1.04673731, + "diversity_loss_mlp": 0.0, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.07497614956550303, + "language_loss": 0.87961793, + "learning_rate": 0.0004847367983379492, + "loss": 0.89020109, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.11578369, + "routerloss_mlp": 0.0, + "step": 2726, + "time_per_iteration": 2.523099899291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066059, + "balance_loss_mlp": 1.05477571, + "diversity_loss_mlp": 0.0, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06275633211650163, + "language_loss": 0.78715622, + "learning_rate": 0.00048442540420193643, + "loss": 0.79781681, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.11291504, + "routerloss_mlp": 0.0, + "step": 2727, + "time_per_iteration": 2.9433038234710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056804, + "balance_loss_mlp": 1.04506755, + "diversity_loss_mlp": 0.0, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07393634521455344, + "language_loss": 0.79367208, + "learning_rate": 0.0004841140161125182, + "loss": 0.80424011, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.11730957, + "routerloss_mlp": 0.0, + "step": 2728, + "time_per_iteration": 3.619252920150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.05171847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.07165329358033216, + "language_loss": 0.84827459, + "learning_rate": 0.0004838026341905857, + "loss": 0.85890496, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.11322021, + "routerloss_mlp": 0.0, + "step": 2729, + "time_per_iteration": 2.716114044189453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057536, + "balance_loss_mlp": 1.04594862, + "diversity_loss_mlp": 0.0, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.13042739485624238, + "language_loss": 0.85312545, + "learning_rate": 0.00048349125855702844, + "loss": 0.86370087, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.11572266, + "routerloss_mlp": 0.0, + "step": 2730, + "time_per_iteration": 2.787280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00837258, + "balance_loss_mlp": 1.43598437, + "diversity_loss_mlp": 0.21135046, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.027658523195400363, + "language_loss": 0.81318069, + "learning_rate": 0.00048317988933273287, + "loss": 0.82155323, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01359018, + "step": 2731, + "time_per_iteration": 2.763814687728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057346, + "balance_loss_mlp": 1.04585993, + "diversity_loss_mlp": 0.0, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.07420390441928848, + "language_loss": 0.82373381, + "learning_rate": 0.00048286852663858367, + "loss": 0.83430725, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.11480713, + "routerloss_mlp": 0.0, + "step": 2732, + "time_per_iteration": 2.9533157348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063655, + "balance_loss_mlp": 1.05203819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.07616653501098058, + "language_loss": 0.8428973, + "learning_rate": 0.000482557170595462, + "loss": 0.8535338, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.11608887, + "routerloss_mlp": 0.0, + "step": 2733, + "time_per_iteration": 2.865147829055786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065104, + "balance_loss_mlp": 1.0532366, + "diversity_loss_mlp": 0.0, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.060395165010054055, + "language_loss": 0.87880594, + "learning_rate": 0.0004822458213242475, + "loss": 0.88945693, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.11859131, + "routerloss_mlp": 0.0, + "step": 2734, + "time_per_iteration": 2.557253360748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070633, + "balance_loss_mlp": 1.05886698, + "diversity_loss_mlp": 0.0, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.1031910380133139, + "language_loss": 0.86086309, + "learning_rate": 0.00048193447894581627, + "loss": 0.8715694, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.11761475, + "routerloss_mlp": 0.0, + "step": 2735, + "time_per_iteration": 3.122976541519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_mlp": 1.06436014, + "diversity_loss_mlp": 0.0, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.06843040001694842, + "language_loss": 0.8809998, + "learning_rate": 0.00048162314358104243, + "loss": 0.89175981, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.11639404, + "routerloss_mlp": 0.0, + "step": 2736, + "time_per_iteration": 2.6340246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00824973, + "balance_loss_mlp": 1.41347969, + "diversity_loss_mlp": 0.20989257, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.031515925317837694, + "language_loss": 0.83306372, + "learning_rate": 0.0004813118153507969, + "loss": 0.84131336, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01328672, + "step": 2737, + "time_per_iteration": 2.7356157302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_mlp": 1.03480983, + "diversity_loss_mlp": 0.0, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03217065957479051, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83488321, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.06396484, + "routerloss_mlp": 0.0, + "step": 2738, + "time_per_iteration": 4.772867202758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107199, + "balance_loss_mlp": 1.06062317, + "diversity_loss_mlp": 0.0, + "epoch": 0.5269334359368988, + "flos": 929952493056.0, + "grad_norm": 0.0555866415390632, + "language_loss": 0.83715498, + "learning_rate": 0.00048068918077736163, + "loss": 0.84787494, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.11358643, + "routerloss_mlp": 0.0, + "step": 2739, + "time_per_iteration": 3.2028074264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076914, + "balance_loss_mlp": 1.06573176, + "diversity_loss_mlp": 0.0, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.06998122113459494, + "language_loss": 0.81445146, + "learning_rate": 0.0004803778746759001, + "loss": 0.82522058, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2740, + "time_per_iteration": 2.87070369720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082959, + "balance_loss_mlp": 1.07215285, + "diversity_loss_mlp": 0.0, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.07737040857299185, + "language_loss": 0.82122779, + "learning_rate": 0.00048006657619242317, + "loss": 0.83205736, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.10809326, + "routerloss_mlp": 0.0, + "step": 2741, + "time_per_iteration": 2.6385269165039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107519, + "balance_loss_mlp": 1.06447887, + "diversity_loss_mlp": 0.0, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.07879516603511716, + "language_loss": 0.78380877, + "learning_rate": 0.00047975528544778775, + "loss": 0.79456067, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2742, + "time_per_iteration": 2.6197235584259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079206, + "balance_loss_mlp": 1.06839335, + "diversity_loss_mlp": 0.0, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07439948679259917, + "language_loss": 0.88816094, + "learning_rate": 0.00047944400256284754, + "loss": 0.89895302, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.10827637, + "routerloss_mlp": 0.0, + "step": 2743, + "time_per_iteration": 2.6887855529785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00830459, + "balance_loss_mlp": 1.42072511, + "diversity_loss_mlp": 0.21262056, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.03227823662204125, + "language_loss": 0.799101, + "learning_rate": 0.0004791327276584532, + "loss": 0.80740565, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01378582, + "step": 2744, + "time_per_iteration": 2.8497848510742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07629538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.0718535906247093, + "language_loss": 0.80497956, + "learning_rate": 0.00047882146085545264, + "loss": 0.81585032, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2745, + "time_per_iteration": 2.6078941822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017458, + "balance_loss_mlp": 1.01199865, + "diversity_loss_mlp": 0.0, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.013176381696238814, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76419842, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 2746, + "time_per_iteration": 4.974900007247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078307, + "balance_loss_mlp": 1.06777453, + "diversity_loss_mlp": 0.0, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0894490118638191, + "language_loss": 0.79344547, + "learning_rate": 0.00047819895203700684, + "loss": 0.80422854, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2747, + "time_per_iteration": 2.717135190963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_mlp": 1.00983751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.009473538771460566, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76527709, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 2748, + "time_per_iteration": 4.642770290374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085947, + "balance_loss_mlp": 1.07577801, + "diversity_loss_mlp": 0.0, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.07060951554594143, + "language_loss": 0.88469762, + "learning_rate": 0.0004775764770742277, + "loss": 0.89555711, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2749, + "time_per_iteration": 2.8018476963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087955, + "balance_loss_mlp": 1.07761312, + "diversity_loss_mlp": 0.0, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.08234082280170717, + "language_loss": 0.86406553, + "learning_rate": 0.00047726525259079777, + "loss": 0.8749451, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2750, + "time_per_iteration": 2.8415229320526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00831428, + "balance_loss_mlp": 1.42309499, + "diversity_loss_mlp": 0.21321589, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.03400797212131273, + "language_loss": 0.88723552, + "learning_rate": 0.0004769540369337798, + "loss": 0.89554983, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01327293, + "step": 2751, + "time_per_iteration": 2.752032518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100532, + "balance_loss_mlp": 1.09000587, + "diversity_loss_mlp": 0.0, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06288245154731438, + "language_loss": 0.85769415, + "learning_rate": 0.00047664283022399794, + "loss": 0.86869949, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2752, + "time_per_iteration": 2.8568003177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107464, + "balance_loss_mlp": 1.09725976, + "diversity_loss_mlp": 0.0, + "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0883883166781065, + "language_loss": 0.80924225, + "learning_rate": 0.00047633163258227376, + "loss": 0.82031691, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2753, + "time_per_iteration": 2.8275938034057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104119, + "balance_loss_mlp": 1.09359312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.06733658380062774, + "language_loss": 0.85417688, + "learning_rate": 0.0004760204441294247, + "loss": 0.86521804, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2754, + "time_per_iteration": 2.6338090896606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104137, + "balance_loss_mlp": 1.09376574, + "diversity_loss_mlp": 0.0, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.06936353635633287, + "language_loss": 0.85999346, + "learning_rate": 0.00047570926498626486, + "loss": 0.87103486, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 2755, + "time_per_iteration": 2.716575860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108637, + "balance_loss_mlp": 1.09822416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.061285448286525046, + "language_loss": 0.81361842, + "learning_rate": 0.00047539809527360474, + "loss": 0.82470477, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2756, + "time_per_iteration": 2.881225109100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.0919373, + "diversity_loss_mlp": 0.0, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05865021558391441, + "language_loss": 0.82642096, + "learning_rate": 0.0004750869351122511, + "loss": 0.83744538, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2757, + "time_per_iteration": 2.9978790283203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096362, + "balance_loss_mlp": 1.08600891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.07787390265260127, + "language_loss": 0.81663013, + "learning_rate": 0.00047477578462300685, + "loss": 0.82759368, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2758, + "time_per_iteration": 2.700833797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090285, + "balance_loss_mlp": 1.07975245, + "diversity_loss_mlp": 0.0, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.069319292192906, + "language_loss": 0.80022508, + "learning_rate": 0.0004744646439266718, + "loss": 0.81112796, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2759, + "time_per_iteration": 3.0144033432006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084993, + "balance_loss_mlp": 1.07477677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.05678736813253772, + "language_loss": 0.92058611, + "learning_rate": 0.000474153513144041, + "loss": 0.93143606, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2760, + "time_per_iteration": 2.890305995941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082633, + "balance_loss_mlp": 1.07224369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.06975892982263965, + "language_loss": 0.8659752, + "learning_rate": 0.00047384239239590633, + "loss": 0.87680155, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2761, + "time_per_iteration": 2.864649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076622, + "balance_loss_mlp": 1.06607819, + "diversity_loss_mlp": 0.0, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06592907525694008, + "language_loss": 0.88956439, + "learning_rate": 0.0004735312818030556, + "loss": 0.90033066, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2762, + "time_per_iteration": 2.7256298065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.06967998, + "diversity_loss_mlp": 0.0, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06903030148880929, + "language_loss": 0.82737643, + "learning_rate": 0.0004732201814862727, + "loss": 0.83817625, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2763, + "time_per_iteration": 2.785104990005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078579, + "balance_loss_mlp": 1.0687145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.07391416357546753, + "language_loss": 0.81619537, + "learning_rate": 0.0004729090915663373, + "loss": 0.82698119, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 2764, + "time_per_iteration": 2.841716766357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841129, + "balance_loss_mlp": 1.43825924, + "diversity_loss_mlp": 0.21717778, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.03676047653681057, + "language_loss": 0.84753668, + "learning_rate": 0.00047259801216402534, + "loss": 0.85594797, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01341068, + "step": 2765, + "time_per_iteration": 2.5414865016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078812, + "balance_loss_mlp": 1.06872129, + "diversity_loss_mlp": 0.0, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.08353685320939014, + "language_loss": 0.86307138, + "learning_rate": 0.00047228694340010845, + "loss": 0.87385947, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 2766, + "time_per_iteration": 2.571230173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083419, + "balance_loss_mlp": 1.07304192, + "diversity_loss_mlp": 0.0, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07758433064211989, + "language_loss": 0.85983396, + "learning_rate": 0.0004719758853953544, + "loss": 0.87066811, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2767, + "time_per_iteration": 3.5577545166015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.07479465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.08923013324738549, + "language_loss": 0.83480549, + "learning_rate": 0.00047166483827052645, + "loss": 0.84565854, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2768, + "time_per_iteration": 2.3904964923858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014357, + "balance_loss_mlp": 1.0088253, + "diversity_loss_mlp": 0.0, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.015852342000118255, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78092843, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2769, + "time_per_iteration": 4.993681907653809 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100393, + "balance_loss_mlp": 1.08974218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.07499519146645399, + "language_loss": 0.8344022, + "learning_rate": 0.000471042777143682, + "loss": 0.84540612, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2770, + "time_per_iteration": 3.2187654972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099844, + "balance_loss_mlp": 1.0895741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.07177386868704265, + "language_loss": 0.79602164, + "learning_rate": 0.0004707317633831707, + "loss": 0.80702007, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2771, + "time_per_iteration": 2.5579092502593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097617, + "balance_loss_mlp": 1.08694136, + "diversity_loss_mlp": 0.0, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.08358365289860634, + "language_loss": 0.78326285, + "learning_rate": 0.00047042076098559673, + "loss": 0.79423904, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.10687256, + "routerloss_mlp": 0.0, + "step": 2772, + "time_per_iteration": 2.6240808963775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089428, + "balance_loss_mlp": 1.07924104, + "diversity_loss_mlp": 0.0, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.07827879900232339, + "language_loss": 0.7374208, + "learning_rate": 0.00047010977007170174, + "loss": 0.7483151, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2773, + "time_per_iteration": 3.239807605743408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108646, + "balance_loss_mlp": 1.07606506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.0770996892807777, + "language_loss": 0.82462615, + "learning_rate": 0.00046979879076222334, + "loss": 0.83549076, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2774, + "time_per_iteration": 2.6871917247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081473, + "balance_loss_mlp": 1.07122087, + "diversity_loss_mlp": 0.0, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.060681013844514214, + "language_loss": 0.84932172, + "learning_rate": 0.0004694878231778939, + "loss": 0.86013645, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2775, + "time_per_iteration": 3.3516969680786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083848, + "balance_loss_mlp": 1.07336903, + "diversity_loss_mlp": 0.0, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06561156947814625, + "language_loss": 0.84353071, + "learning_rate": 0.0004691768674394423, + "loss": 0.85436922, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2776, + "time_per_iteration": 2.9356815814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010203, + "balance_loss_mlp": 1.01491189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.017317997453326725, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85504305, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2777, + "time_per_iteration": 4.766932010650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017275, + "balance_loss_mlp": 1.01186275, + "diversity_loss_mlp": 0.0, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.016201867017030143, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77670807, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 2778, + "time_per_iteration": 5.022111177444458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081648, + "balance_loss_mlp": 1.07109189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.08348606714079294, + "language_loss": 0.79229748, + "learning_rate": 0.00046824407250656676, + "loss": 0.803114, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2779, + "time_per_iteration": 2.6202685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079296, + "balance_loss_mlp": 1.06859064, + "diversity_loss_mlp": 0.0, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.0812040646365834, + "language_loss": 0.83481312, + "learning_rate": 0.0004679331653588161, + "loss": 0.84560603, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2780, + "time_per_iteration": 2.6287879943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.07337165, + "diversity_loss_mlp": 0.0, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.08148878126655458, + "language_loss": 0.85570091, + "learning_rate": 0.0004676222706605147, + "loss": 0.86654037, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2781, + "time_per_iteration": 2.634186029434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082358, + "balance_loss_mlp": 1.07175457, + "diversity_loss_mlp": 0.0, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.08561637601090062, + "language_loss": 0.84885913, + "learning_rate": 0.0004673113885323626, + "loss": 0.85968268, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.10601807, + "routerloss_mlp": 0.0, + "step": 2782, + "time_per_iteration": 2.839108943939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.07358241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.0730092425976976, + "language_loss": 0.78793383, + "learning_rate": 0.00046700051909505494, + "loss": 0.79877448, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.10479736, + "routerloss_mlp": 0.0, + "step": 2783, + "time_per_iteration": 3.1548988819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080824, + "balance_loss_mlp": 1.06943369, + "diversity_loss_mlp": 0.0, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06678731146909953, + "language_loss": 0.84066731, + "learning_rate": 0.000466689662469282, + "loss": 0.85147554, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.11383057, + "routerloss_mlp": 0.0, + "step": 2784, + "time_per_iteration": 2.6213507652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082867, + "balance_loss_mlp": 1.07235312, + "diversity_loss_mlp": 0.0, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.06931446022689573, + "language_loss": 0.83996934, + "learning_rate": 0.00046637881877572917, + "loss": 0.85079801, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.10522461, + "routerloss_mlp": 0.0, + "step": 2785, + "time_per_iteration": 3.1161208152770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_mlp": 1.07350779, + "diversity_loss_mlp": 0.0, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.05978198327100757, + "language_loss": 0.84824258, + "learning_rate": 0.0004660679881350764, + "loss": 0.85908508, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.10742188, + "routerloss_mlp": 0.0, + "step": 2786, + "time_per_iteration": 2.7317774295806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_mlp": 1.0375849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.025126940202686972, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.7665174, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.06005859, + "routerloss_mlp": 0.0, + "step": 2787, + "time_per_iteration": 5.0151801109313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079952, + "balance_loss_mlp": 1.06945598, + "diversity_loss_mlp": 0.0, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07181749108152896, + "language_loss": 0.78038859, + "learning_rate": 0.0004654463664951667, + "loss": 0.79118812, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 2788, + "time_per_iteration": 2.9862492084503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.06444538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06160548649513732, + "language_loss": 0.83008492, + "learning_rate": 0.0004651355757372447, + "loss": 0.84083349, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 2789, + "time_per_iteration": 2.6209347248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00838367, + "balance_loss_mlp": 1.43426061, + "diversity_loss_mlp": 0.2158158, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 0.029696530744324656, + "language_loss": 0.8589375, + "learning_rate": 0.00046482479851489274, + "loss": 0.86732113, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01332852, + "step": 2790, + "time_per_iteration": 2.6991934776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.06660962, + "diversity_loss_mlp": 0.0, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09378702232215988, + "language_loss": 0.77937293, + "learning_rate": 0.00046451403494876525, + "loss": 0.79014528, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.10632324, + "routerloss_mlp": 0.0, + "step": 2791, + "time_per_iteration": 2.8735973834991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05943799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.07434319158841775, + "language_loss": 0.84554839, + "learning_rate": 0.0004642032851595111, + "loss": 0.85625106, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.1083374, + "routerloss_mlp": 0.0, + "step": 2792, + "time_per_iteration": 2.7458536624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065699, + "balance_loss_mlp": 1.05472004, + "diversity_loss_mlp": 0.0, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.06545464420604186, + "language_loss": 0.85163087, + "learning_rate": 0.00046389254926777404, + "loss": 0.86228788, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.10980225, + "routerloss_mlp": 0.0, + "step": 2793, + "time_per_iteration": 2.823887825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062925, + "balance_loss_mlp": 1.0519762, + "diversity_loss_mlp": 0.0, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.06502650627416932, + "language_loss": 0.78292251, + "learning_rate": 0.0004635818273941926, + "loss": 0.79355174, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.10955811, + "routerloss_mlp": 0.0, + "step": 2794, + "time_per_iteration": 3.569359302520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058844, + "balance_loss_mlp": 1.04798412, + "diversity_loss_mlp": 0.0, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.0851115940139546, + "language_loss": 0.81696212, + "learning_rate": 0.0004632711196593997, + "loss": 0.82755053, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2795, + "time_per_iteration": 2.763248920440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059516, + "balance_loss_mlp": 1.04872167, + "diversity_loss_mlp": 0.0, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08577601840657965, + "language_loss": 0.85307401, + "learning_rate": 0.00046296042618402297, + "loss": 0.86366916, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2796, + "time_per_iteration": 3.059995651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.05436158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.05816929772054262, + "language_loss": 0.79285312, + "learning_rate": 0.0004626497470886839, + "loss": 0.80350512, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.10845947, + "routerloss_mlp": 0.0, + "step": 2797, + "time_per_iteration": 2.9551138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059667, + "balance_loss_mlp": 1.04897988, + "diversity_loss_mlp": 0.0, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06686475877008137, + "language_loss": 0.82082057, + "learning_rate": 0.00046233908249399897, + "loss": 0.83141726, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.10693359, + "routerloss_mlp": 0.0, + "step": 2798, + "time_per_iteration": 2.7494163513183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_mlp": 1.06012726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.06311972638358435, + "language_loss": 0.78919041, + "learning_rate": 0.00046202843252057905, + "loss": 0.79990107, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.10943604, + "routerloss_mlp": 0.0, + "step": 2799, + "time_per_iteration": 2.586824655532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.06545627, + "diversity_loss_mlp": 0.0, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.06763496495115903, + "language_loss": 0.83705521, + "learning_rate": 0.00046171779728902896, + "loss": 0.84781897, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2800, + "time_per_iteration": 2.5922951698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084728, + "balance_loss_mlp": 1.07354665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.12725923305511472, + "language_loss": 0.86135888, + "learning_rate": 0.000461407176919948, + "loss": 0.87220615, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.11181641, + "routerloss_mlp": 0.0, + "step": 2801, + "time_per_iteration": 2.532080888748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085364, + "balance_loss_mlp": 1.07459974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08372818850883645, + "language_loss": 0.85317719, + "learning_rate": 0.00046109657153392997, + "loss": 0.8640309, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2802, + "time_per_iteration": 2.7498726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.07185912, + "diversity_loss_mlp": 0.0, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.07972844989907181, + "language_loss": 0.82981819, + "learning_rate": 0.0004607859812515622, + "loss": 0.84064734, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.11071777, + "routerloss_mlp": 0.0, + "step": 2803, + "time_per_iteration": 2.5823397636413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077838, + "balance_loss_mlp": 1.06679916, + "diversity_loss_mlp": 0.0, + "epoch": 0.5394382454790304, + "flos": 512057479680.0, + "grad_norm": 0.06982591680837838, + "language_loss": 0.88185596, + "learning_rate": 0.00046047540619342667, + "loss": 0.89263427, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.1104126, + "routerloss_mlp": 0.0, + "step": 2804, + "time_per_iteration": 2.582594156265259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089122, + "balance_loss_mlp": 1.07845902, + "diversity_loss_mlp": 0.0, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.06923180186476277, + "language_loss": 0.80359995, + "learning_rate": 0.00046016484648009933, + "loss": 0.81449121, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.10675049, + "routerloss_mlp": 0.0, + "step": 2805, + "time_per_iteration": 2.705085277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082055, + "balance_loss_mlp": 1.0713259, + "diversity_loss_mlp": 0.0, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.06938884531628577, + "language_loss": 0.81049907, + "learning_rate": 0.0004598543022321501, + "loss": 0.82131958, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.10736084, + "routerloss_mlp": 0.0, + "step": 2806, + "time_per_iteration": 2.6722495555877686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855076, + "balance_loss_mlp": 1.46593428, + "diversity_loss_mlp": 0.21781196, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.030466031644405155, + "language_loss": 0.79783833, + "learning_rate": 0.0004595437735701433, + "loss": 0.80638903, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01320273, + "step": 2807, + "time_per_iteration": 2.734110116958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.07728648, + "diversity_loss_mlp": 0.0, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08474622827734493, + "language_loss": 0.83849192, + "learning_rate": 0.00045923326061463623, + "loss": 0.84937334, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2808, + "time_per_iteration": 2.7606189250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089545, + "balance_loss_mlp": 1.07878006, + "diversity_loss_mlp": 0.0, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06442619071995537, + "language_loss": 0.8173002, + "learning_rate": 0.00045892276348618113, + "loss": 0.82819563, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.10772705, + "routerloss_mlp": 0.0, + "step": 2809, + "time_per_iteration": 2.9691591262817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_mlp": 1.02887774, + "diversity_loss_mlp": 0.0, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.01908051648382603, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79294789, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2810, + "time_per_iteration": 4.957923173904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089256, + "balance_loss_mlp": 1.07848597, + "diversity_loss_mlp": 0.0, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.05960464217413758, + "language_loss": 0.80596066, + "learning_rate": 0.000458301817192603, + "loss": 0.81685317, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.10778809, + "routerloss_mlp": 0.0, + "step": 2811, + "time_per_iteration": 2.852247714996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021724, + "balance_loss_mlp": 1.0165503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.015447521326512613, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81863511, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.05175781, + "routerloss_mlp": 0.0, + "step": 2812, + "time_per_iteration": 4.808724880218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080487, + "balance_loss_mlp": 1.06993747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.06805695837678187, + "language_loss": 0.87130654, + "learning_rate": 0.00045768093565369983, + "loss": 0.88211143, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 2813, + "time_per_iteration": 2.7794101238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.08034182, + "diversity_loss_mlp": 0.0, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.06578755075233327, + "language_loss": 0.8208549, + "learning_rate": 0.0004573705194685646, + "loss": 0.83176434, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2814, + "time_per_iteration": 2.686871290206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084437, + "balance_loss_mlp": 1.07364845, + "diversity_loss_mlp": 0.0, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07321549809116977, + "language_loss": 0.84966654, + "learning_rate": 0.00045706011983366157, + "loss": 0.86051095, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.10784912, + "routerloss_mlp": 0.0, + "step": 2815, + "time_per_iteration": 2.676772117614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843207, + "balance_loss_mlp": 1.44560027, + "diversity_loss_mlp": 0.21445701, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.03775972378408833, + "language_loss": 0.82685602, + "learning_rate": 0.00045674973686949847, + "loss": 0.83528805, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01317827, + "step": 2816, + "time_per_iteration": 2.548164129257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079521, + "balance_loss_mlp": 1.06887531, + "diversity_loss_mlp": 0.0, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06715248152064907, + "language_loss": 0.85478067, + "learning_rate": 0.0004564393706965766, + "loss": 0.86557591, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.10656738, + "routerloss_mlp": 0.0, + "step": 2817, + "time_per_iteration": 2.9715416431427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078759, + "balance_loss_mlp": 1.06789875, + "diversity_loss_mlp": 0.0, + "epoch": 0.5421315890727203, + "flos": 462374148096.0, + "grad_norm": 0.07300594242261846, + "language_loss": 0.81410033, + "learning_rate": 0.00045612902143539116, + "loss": 0.82488787, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.10864258, + "routerloss_mlp": 0.0, + "step": 2818, + "time_per_iteration": 2.5861568450927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_mlp": 1.05926371, + "diversity_loss_mlp": 0.0, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.07796543703625758, + "language_loss": 0.8169418, + "learning_rate": 0.00045581868920642986, + "loss": 0.82763875, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2819, + "time_per_iteration": 2.495675563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079235, + "balance_loss_mlp": 1.06864905, + "diversity_loss_mlp": 0.0, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.08284985931126, + "language_loss": 0.79605496, + "learning_rate": 0.00045550837413017457, + "loss": 0.80684733, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2820, + "time_per_iteration": 2.5968475341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_mlp": 1.07137275, + "diversity_loss_mlp": 0.0, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06853869944040722, + "language_loss": 0.85501075, + "learning_rate": 0.0004551980763271005, + "loss": 0.86582589, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2821, + "time_per_iteration": 2.6689629554748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080137, + "balance_loss_mlp": 1.06970072, + "diversity_loss_mlp": 0.0, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.07047505467714002, + "language_loss": 0.83788973, + "learning_rate": 0.0004548877959176756, + "loss": 0.84869111, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2822, + "time_per_iteration": 2.8898305892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.06903815, + "diversity_loss_mlp": 0.0, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.06782192405371351, + "language_loss": 0.86297488, + "learning_rate": 0.00045457753302236166, + "loss": 0.87376869, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2823, + "time_per_iteration": 2.626262903213501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087962, + "balance_loss_mlp": 1.07755554, + "diversity_loss_mlp": 0.0, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.07336203540826484, + "language_loss": 0.87131381, + "learning_rate": 0.00045426728776161353, + "loss": 0.88219345, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2824, + "time_per_iteration": 2.7630255222320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085716, + "balance_loss_mlp": 1.07529116, + "diversity_loss_mlp": 0.0, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.07766893457840997, + "language_loss": 0.81382459, + "learning_rate": 0.00045395706025587863, + "loss": 0.82468176, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2825, + "time_per_iteration": 2.653036594390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.05976105, + "diversity_loss_mlp": 0.0, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.08392292239142347, + "language_loss": 0.82965428, + "learning_rate": 0.00045364685062559843, + "loss": 0.84035897, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.10717773, + "routerloss_mlp": 0.0, + "step": 2826, + "time_per_iteration": 2.8091156482696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075397, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.06510139608888613, + "language_loss": 0.91622829, + "learning_rate": 0.0004533366589912067, + "loss": 0.92698228, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2827, + "time_per_iteration": 2.949005365371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075847, + "balance_loss_mlp": 1.06538677, + "diversity_loss_mlp": 0.0, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.07049343673366977, + "language_loss": 0.77641904, + "learning_rate": 0.0004530264854731306, + "loss": 0.78717756, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2828, + "time_per_iteration": 3.054252862930298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079568, + "balance_loss_mlp": 1.06920242, + "diversity_loss_mlp": 0.0, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.05986165572949975, + "language_loss": 0.84122354, + "learning_rate": 0.00045271633019179034, + "loss": 0.85201919, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2829, + "time_per_iteration": 2.788818836212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.06762242, + "diversity_loss_mlp": 0.0, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.05963281032217842, + "language_loss": 0.87701666, + "learning_rate": 0.0004524061932675986, + "loss": 0.88779569, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2830, + "time_per_iteration": 2.861154079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073509, + "balance_loss_mlp": 1.06306028, + "diversity_loss_mlp": 0.0, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.11132414831600651, + "language_loss": 0.87095535, + "learning_rate": 0.00045209607482096125, + "loss": 0.88169038, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2831, + "time_per_iteration": 3.041248321533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107632, + "balance_loss_mlp": 1.06573415, + "diversity_loss_mlp": 0.0, + "epoch": 0.5448249326664102, + "flos": 483381043200.0, + "grad_norm": 0.07049073021000962, + "language_loss": 0.84385192, + "learning_rate": 0.0004517859749722772, + "loss": 0.85461509, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2832, + "time_per_iteration": 2.663478374481201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075242, + "balance_loss_mlp": 1.0643816, + "diversity_loss_mlp": 0.0, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.06386820666055518, + "language_loss": 0.79316235, + "learning_rate": 0.0004514758938419376, + "loss": 0.80391467, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.10870361, + "routerloss_mlp": 0.0, + "step": 2833, + "time_per_iteration": 2.8141582012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_mlp": 1.03721869, + "diversity_loss_mlp": 0.0, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.027736452139364785, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77963334, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.05541992, + "routerloss_mlp": 0.0, + "step": 2834, + "time_per_iteration": 4.960749864578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075071, + "balance_loss_mlp": 1.06446719, + "diversity_loss_mlp": 0.0, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.06436328535255592, + "language_loss": 0.83993077, + "learning_rate": 0.00045085578821782175, + "loss": 0.85068148, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1060791, + "routerloss_mlp": 0.0, + "step": 2835, + "time_per_iteration": 2.6025185585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020548, + "balance_loss_mlp": 1.01516008, + "diversity_loss_mlp": 0.0, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.015651807900939278, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155292, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 2836, + "time_per_iteration": 4.911514043807983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079494, + "balance_loss_mlp": 1.06864595, + "diversity_loss_mlp": 0.0, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.05502946705999508, + "language_loss": 0.81078947, + "learning_rate": 0.00045023575891159866, + "loss": 0.82158434, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.10852051, + "routerloss_mlp": 0.0, + "step": 2837, + "time_per_iteration": 2.7158284187316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_mlp": 1.00321293, + "diversity_loss_mlp": 0.0, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.010060791837063862, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75772309, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2838, + "time_per_iteration": 4.9448912143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078413, + "balance_loss_mlp": 1.06803036, + "diversity_loss_mlp": 0.0, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.059936217606746015, + "language_loss": 0.78111225, + "learning_rate": 0.0004496158068861354, + "loss": 0.79189646, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 2839, + "time_per_iteration": 2.8019115924835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.07090366, + "diversity_loss_mlp": 0.0, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06804602152838367, + "language_loss": 0.80713242, + "learning_rate": 0.00044930586015455207, + "loss": 0.81794775, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.10638428, + "routerloss_mlp": 0.0, + "step": 2840, + "time_per_iteration": 2.771359443664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076777, + "balance_loss_mlp": 1.06646562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.0578733121218936, + "language_loss": 0.88904727, + "learning_rate": 0.000448995933104179, + "loss": 0.89981508, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2841, + "time_per_iteration": 2.8486392498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081465, + "balance_loss_mlp": 1.07075977, + "diversity_loss_mlp": 0.0, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.07392730491467848, + "language_loss": 0.80162299, + "learning_rate": 0.00044868602585534077, + "loss": 0.81243765, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2842, + "time_per_iteration": 2.8463480472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074348, + "balance_loss_mlp": 1.06379187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.0858024928700591, + "language_loss": 0.89360344, + "learning_rate": 0.0004483761385283541, + "loss": 0.90434694, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.10565186, + "routerloss_mlp": 0.0, + "step": 2843, + "time_per_iteration": 2.534032106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00870358, + "balance_loss_mlp": 1.4994092, + "diversity_loss_mlp": 0.21570696, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.030684440159293704, + "language_loss": 0.8165319, + "learning_rate": 0.0004480662712435281, + "loss": 0.82523549, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01280049, + "step": 2844, + "time_per_iteration": 2.7523300647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081741, + "balance_loss_mlp": 1.07085109, + "diversity_loss_mlp": 0.0, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.08261462073704483, + "language_loss": 0.88389564, + "learning_rate": 0.0004477564241211635, + "loss": 0.89471304, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2845, + "time_per_iteration": 2.5676896572113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068187, + "balance_loss_mlp": 1.0573566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07762403474355188, + "language_loss": 0.868963, + "learning_rate": 0.0004474465972815541, + "loss": 0.87964487, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2846, + "time_per_iteration": 2.4843738079071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073996, + "balance_loss_mlp": 1.06337464, + "diversity_loss_mlp": 0.0, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.05857404260801407, + "language_loss": 0.87612844, + "learning_rate": 0.000447136790844985, + "loss": 0.88686836, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.10626221, + "routerloss_mlp": 0.0, + "step": 2847, + "time_per_iteration": 2.659214973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068933, + "balance_loss_mlp": 1.05774474, + "diversity_loss_mlp": 0.0, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.0657788254057266, + "language_loss": 0.80922693, + "learning_rate": 0.00044682700493173385, + "loss": 0.81991625, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.11187744, + "routerloss_mlp": 0.0, + "step": 2848, + "time_per_iteration": 2.8093039989471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.06077814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.06921376228249611, + "language_loss": 0.80399549, + "learning_rate": 0.00044651723966207004, + "loss": 0.81471407, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.11090088, + "routerloss_mlp": 0.0, + "step": 2849, + "time_per_iteration": 3.1084961891174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.05826974, + "diversity_loss_mlp": 0.0, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.06382752106805908, + "language_loss": 0.78137773, + "learning_rate": 0.00044620749515625536, + "loss": 0.79206896, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2850, + "time_per_iteration": 2.8127682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_mlp": 1.05505395, + "diversity_loss_mlp": 0.0, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.07084116902380141, + "language_loss": 0.85142213, + "learning_rate": 0.00044589777153454334, + "loss": 0.86208153, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.10888672, + "routerloss_mlp": 0.0, + "step": 2851, + "time_per_iteration": 2.7690277099609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063836, + "balance_loss_mlp": 1.05239749, + "diversity_loss_mlp": 0.0, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.06308922523972363, + "language_loss": 0.83850712, + "learning_rate": 0.00044558806891717895, + "loss": 0.84914547, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.11450195, + "routerloss_mlp": 0.0, + "step": 2852, + "time_per_iteration": 2.542076587677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066106, + "balance_loss_mlp": 1.05529404, + "diversity_loss_mlp": 0.0, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.06000502851088379, + "language_loss": 0.79783493, + "learning_rate": 0.0004452783874243998, + "loss": 0.808496, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.1081543, + "routerloss_mlp": 0.0, + "step": 2853, + "time_per_iteration": 2.8680150508880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070657, + "balance_loss_mlp": 1.06022012, + "diversity_loss_mlp": 0.0, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.07387916596955035, + "language_loss": 0.84572864, + "learning_rate": 0.00044496872717643475, + "loss": 0.85643518, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 2854, + "time_per_iteration": 2.676128625869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_mlp": 1.04261672, + "diversity_loss_mlp": 0.0, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.03710413532206065, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78137678, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.05761719, + "routerloss_mlp": 0.0, + "step": 2855, + "time_per_iteration": 4.937518835067749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.06609333, + "diversity_loss_mlp": 0.0, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.06582649113696544, + "language_loss": 0.81989098, + "learning_rate": 0.0004443494708958217, + "loss": 0.83065504, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2856, + "time_per_iteration": 2.9764318466186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077652, + "balance_loss_mlp": 1.06707263, + "diversity_loss_mlp": 0.0, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.05962775351044122, + "language_loss": 0.80705082, + "learning_rate": 0.0004440398751035906, + "loss": 0.81782728, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2857, + "time_per_iteration": 2.8708760738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107925, + "balance_loss_mlp": 1.06846118, + "diversity_loss_mlp": 0.0, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.08652259855452149, + "language_loss": 0.83723986, + "learning_rate": 0.00044373030103700645, + "loss": 0.84803236, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.10791016, + "routerloss_mlp": 0.0, + "step": 2858, + "time_per_iteration": 2.629887342453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857386, + "balance_loss_mlp": 1.47058845, + "diversity_loss_mlp": 0.21831456, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.03034959963101528, + "language_loss": 0.79655832, + "learning_rate": 0.000443420748816257, + "loss": 0.80513215, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293462, + "step": 2859, + "time_per_iteration": 2.8473408222198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107821, + "balance_loss_mlp": 1.06795764, + "diversity_loss_mlp": 0.0, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.07076083110298415, + "language_loss": 0.78692329, + "learning_rate": 0.0004431112185615208, + "loss": 0.79770535, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2860, + "time_per_iteration": 2.751131534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.0721283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.06396450124437818, + "language_loss": 0.7993266, + "learning_rate": 0.00044280171039296845, + "loss": 0.81015229, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2861, + "time_per_iteration": 2.606870651245117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082002, + "balance_loss_mlp": 1.0716126, + "diversity_loss_mlp": 0.0, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.0734058146638898, + "language_loss": 0.8832019, + "learning_rate": 0.0004424922244307616, + "loss": 0.89402187, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2862, + "time_per_iteration": 2.728055477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081346, + "balance_loss_mlp": 1.07124305, + "diversity_loss_mlp": 0.0, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.08810368166009505, + "language_loss": 0.82030249, + "learning_rate": 0.00044218276079505315, + "loss": 0.83111596, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2863, + "time_per_iteration": 2.8925743103027344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076982, + "balance_loss_mlp": 1.0667721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.06918705117949257, + "language_loss": 0.74817479, + "learning_rate": 0.0004418733196059876, + "loss": 0.75894463, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2864, + "time_per_iteration": 2.747131109237671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068041, + "balance_loss_mlp": 1.0579797, + "diversity_loss_mlp": 0.0, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.060188467246496694, + "language_loss": 0.79747194, + "learning_rate": 0.0004415639009837008, + "loss": 0.80815232, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 2865, + "time_per_iteration": 2.838609218597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077074, + "balance_loss_mlp": 1.06704867, + "diversity_loss_mlp": 0.0, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.06869441498871262, + "language_loss": 0.82126647, + "learning_rate": 0.00044125450504831955, + "loss": 0.83203721, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2866, + "time_per_iteration": 2.7267115116119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080685, + "balance_loss_mlp": 1.07046294, + "diversity_loss_mlp": 0.0, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.0812577822304444, + "language_loss": 0.82503623, + "learning_rate": 0.0004409451319199622, + "loss": 0.83584309, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 2867, + "time_per_iteration": 2.6727194786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.07005203, + "diversity_loss_mlp": 0.0, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07302760882162292, + "language_loss": 0.84415638, + "learning_rate": 0.0004406357817187381, + "loss": 0.8549571, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2868, + "time_per_iteration": 2.9669716358184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.07424247, + "diversity_loss_mlp": 0.0, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.06120403113840053, + "language_loss": 0.81250817, + "learning_rate": 0.0004403264545647474, + "loss": 0.82335043, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 2869, + "time_per_iteration": 3.535280704498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092656, + "balance_loss_mlp": 1.08244562, + "diversity_loss_mlp": 0.0, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.05305368525165607, + "language_loss": 0.84751379, + "learning_rate": 0.00044001715057808154, + "loss": 0.85844034, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 2870, + "time_per_iteration": 2.757197618484497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00867753, + "balance_loss_mlp": 1.49414647, + "diversity_loss_mlp": 0.21602358, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.02933333976418528, + "language_loss": 0.81627762, + "learning_rate": 0.0004397078698788232, + "loss": 0.82495517, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01266836, + "step": 2871, + "time_per_iteration": 3.241936445236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_mlp": 1.04097104, + "diversity_loss_mlp": 0.0, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.0256992480173019, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81488657, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2872, + "time_per_iteration": 4.879035234451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103168, + "balance_loss_mlp": 1.09304726, + "diversity_loss_mlp": 0.0, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.06889966135830194, + "language_loss": 0.78025937, + "learning_rate": 0.00043908937882281343, + "loss": 0.79129106, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.10119629, + "routerloss_mlp": 0.0, + "step": 2873, + "time_per_iteration": 2.624072313308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097291, + "balance_loss_mlp": 1.08644319, + "diversity_loss_mlp": 0.0, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.06659644406743612, + "language_loss": 0.82492054, + "learning_rate": 0.0004387801687061814, + "loss": 0.83589351, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.10858154, + "routerloss_mlp": 0.0, + "step": 2874, + "time_per_iteration": 2.839524269104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100748, + "balance_loss_mlp": 1.09040689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.06411004123803754, + "language_loss": 0.80204833, + "learning_rate": 0.0004384709823571958, + "loss": 0.81305587, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2875, + "time_per_iteration": 2.768268346786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092947, + "balance_loss_mlp": 1.08278441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.0827933156096061, + "language_loss": 0.83099473, + "learning_rate": 0.0004381618198958932, + "loss": 0.84192419, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 2876, + "time_per_iteration": 3.509364604949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084381, + "balance_loss_mlp": 1.07393849, + "diversity_loss_mlp": 0.0, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0672046455921574, + "language_loss": 0.83616996, + "learning_rate": 0.00043785268144230137, + "loss": 0.84701377, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.10449219, + "routerloss_mlp": 0.0, + "step": 2877, + "time_per_iteration": 2.8941080570220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078858, + "balance_loss_mlp": 1.06849325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.08466064144544548, + "language_loss": 0.82657743, + "learning_rate": 0.00043754356711643837, + "loss": 0.83736604, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 2878, + "time_per_iteration": 2.6849513053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072509, + "balance_loss_mlp": 1.0620904, + "diversity_loss_mlp": 0.0, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.08115939494621484, + "language_loss": 0.84283209, + "learning_rate": 0.0004372344770383132, + "loss": 0.85355723, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2879, + "time_per_iteration": 2.809833526611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064473, + "balance_loss_mlp": 1.05426884, + "diversity_loss_mlp": 0.0, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.15468249092113104, + "language_loss": 0.82951438, + "learning_rate": 0.00043692541132792507, + "loss": 0.84015906, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.10205078, + "routerloss_mlp": 0.0, + "step": 2880, + "time_per_iteration": 2.6886332035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.05541205, + "diversity_loss_mlp": 0.0, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07258014540865806, + "language_loss": 0.83396262, + "learning_rate": 0.00043661637010526384, + "loss": 0.84461993, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2881, + "time_per_iteration": 2.484912872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010637, + "balance_loss_mlp": 1.05335283, + "diversity_loss_mlp": 0.0, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.07022154553173111, + "language_loss": 0.83217472, + "learning_rate": 0.00043630735349031025, + "loss": 0.8428117, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2882, + "time_per_iteration": 2.627950429916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064657, + "balance_loss_mlp": 1.05427396, + "diversity_loss_mlp": 0.0, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.05734398116556458, + "language_loss": 0.81837022, + "learning_rate": 0.00043599836160303495, + "loss": 0.8290168, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2883, + "time_per_iteration": 2.87358021736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_mlp": 1.05094647, + "diversity_loss_mlp": 0.0, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.05952583825506871, + "language_loss": 0.77472365, + "learning_rate": 0.0004356893945633995, + "loss": 0.78534073, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 2884, + "time_per_iteration": 2.9415786266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058414, + "balance_loss_mlp": 1.04738104, + "diversity_loss_mlp": 0.0, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06387157363580499, + "language_loss": 0.81997669, + "learning_rate": 0.0004353804524913551, + "loss": 0.8305608, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.11035156, + "routerloss_mlp": 0.0, + "step": 2885, + "time_per_iteration": 2.5772132873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106399, + "balance_loss_mlp": 1.05298674, + "diversity_loss_mlp": 0.0, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07314612024272811, + "language_loss": 0.82015049, + "learning_rate": 0.0004350715355068441, + "loss": 0.8307904, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.11010742, + "routerloss_mlp": 0.0, + "step": 2886, + "time_per_iteration": 2.7211849689483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062969, + "balance_loss_mlp": 1.05221653, + "diversity_loss_mlp": 0.0, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.08671001380075964, + "language_loss": 0.79774809, + "learning_rate": 0.00043476264372979847, + "loss": 0.8083778, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.10754395, + "routerloss_mlp": 0.0, + "step": 2887, + "time_per_iteration": 2.5452206134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.05403173, + "diversity_loss_mlp": 0.0, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.08125450311694367, + "language_loss": 0.78590369, + "learning_rate": 0.0004344537772801408, + "loss": 0.79654968, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.10577393, + "routerloss_mlp": 0.0, + "step": 2888, + "time_per_iteration": 3.870267391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_mlp": 1.02839172, + "diversity_loss_mlp": 0.0, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.026917818165577125, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74456155, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 2889, + "time_per_iteration": 4.943026065826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091351, + "balance_loss_mlp": 1.08043766, + "diversity_loss_mlp": 0.0, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07456412824125162, + "language_loss": 0.83536172, + "learning_rate": 0.0004338361208426298, + "loss": 0.84627521, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2890, + "time_per_iteration": 2.65266752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094404, + "balance_loss_mlp": 1.08348465, + "diversity_loss_mlp": 0.0, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.057576040721241756, + "language_loss": 0.81499392, + "learning_rate": 0.00043352733109457164, + "loss": 0.82593793, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.10919189, + "routerloss_mlp": 0.0, + "step": 2891, + "time_per_iteration": 2.927246332168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106638, + "balance_loss_mlp": 1.09556401, + "diversity_loss_mlp": 0.0, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.0763949134442708, + "language_loss": 0.84462321, + "learning_rate": 0.00043321856715349244, + "loss": 0.85568959, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.11077881, + "routerloss_mlp": 0.0, + "step": 2892, + "time_per_iteration": 2.970857858657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110278, + "balance_loss_mlp": 1.0918721, + "diversity_loss_mlp": 0.0, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07453927070697552, + "language_loss": 0.80594504, + "learning_rate": 0.00043290982913926466, + "loss": 0.81697285, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.10913086, + "routerloss_mlp": 0.0, + "step": 2893, + "time_per_iteration": 2.8581972122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105658, + "balance_loss_mlp": 1.09473801, + "diversity_loss_mlp": 0.0, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.08476057735977802, + "language_loss": 0.84177083, + "learning_rate": 0.0004326011171717514, + "loss": 0.85282743, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.109375, + "routerloss_mlp": 0.0, + "step": 2894, + "time_per_iteration": 2.90563702583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094642, + "balance_loss_mlp": 1.08371019, + "diversity_loss_mlp": 0.0, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.06785531665857511, + "language_loss": 0.80468631, + "learning_rate": 0.0004322924313708051, + "loss": 0.8156327, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.10931396, + "routerloss_mlp": 0.0, + "step": 2895, + "time_per_iteration": 2.51784610748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092855, + "balance_loss_mlp": 1.08219218, + "diversity_loss_mlp": 0.0, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.07706946900287333, + "language_loss": 0.84533763, + "learning_rate": 0.0004319837718562681, + "loss": 0.85626626, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.10668945, + "routerloss_mlp": 0.0, + "step": 2896, + "time_per_iteration": 2.5862512588500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083747, + "balance_loss_mlp": 1.07321525, + "diversity_loss_mlp": 0.0, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 0.0793708179068888, + "language_loss": 0.83050567, + "learning_rate": 0.0004316751387479726, + "loss": 0.84134316, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2897, + "time_per_iteration": 2.778136730194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857516, + "balance_loss_mlp": 1.47219694, + "diversity_loss_mlp": 0.21748725, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.034004819690404205, + "language_loss": 0.82499564, + "learning_rate": 0.0004313665321657409, + "loss": 0.83357084, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267361, + "step": 2898, + "time_per_iteration": 3.7754030227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06795418, + "diversity_loss_mlp": 0.0, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.08236969633510602, + "language_loss": 0.79824448, + "learning_rate": 0.00043105795222938436, + "loss": 0.80903113, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.1071167, + "routerloss_mlp": 0.0, + "step": 2899, + "time_per_iteration": 2.7090694904327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073397, + "balance_loss_mlp": 1.06296027, + "diversity_loss_mlp": 0.0, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07659548301877016, + "language_loss": 0.78690445, + "learning_rate": 0.00043074939905870467, + "loss": 0.79763848, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2900, + "time_per_iteration": 2.6444900035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.08372730008806528, + "language_loss": 0.80284113, + "learning_rate": 0.0004304408727734927, + "loss": 0.81353253, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 2901, + "time_per_iteration": 2.6800661087036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855039, + "balance_loss_mlp": 1.46478724, + "diversity_loss_mlp": 0.21833366, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.026106559121528438, + "language_loss": 0.88945115, + "learning_rate": 0.0004301323734935288, + "loss": 0.89800155, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01347797, + "step": 2902, + "time_per_iteration": 2.6880388259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05446076, + "diversity_loss_mlp": 0.0, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.08715674624995783, + "language_loss": 0.87386537, + "learning_rate": 0.000429823901338583, + "loss": 0.88451326, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2903, + "time_per_iteration": 2.611330032348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.06004524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.07350666628476007, + "language_loss": 0.86772639, + "learning_rate": 0.00042951545642841513, + "loss": 0.87843215, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.10534668, + "routerloss_mlp": 0.0, + "step": 2904, + "time_per_iteration": 3.066653251647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06802535, + "diversity_loss_mlp": 0.0, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06907930895976065, + "language_loss": 0.86694556, + "learning_rate": 0.0004292070388827737, + "loss": 0.87773216, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.10644531, + "routerloss_mlp": 0.0, + "step": 2905, + "time_per_iteration": 2.5430614948272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068326, + "balance_loss_mlp": 1.05785918, + "diversity_loss_mlp": 0.0, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06877653703862108, + "language_loss": 0.81346464, + "learning_rate": 0.00042889864882139753, + "loss": 0.82414794, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2906, + "time_per_iteration": 2.5722434520721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075755, + "balance_loss_mlp": 1.06534863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06732553967994827, + "language_loss": 0.81503737, + "learning_rate": 0.0004285902863640139, + "loss": 0.82579494, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 2907, + "time_per_iteration": 2.643721580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_mlp": 1.06431222, + "diversity_loss_mlp": 0.0, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.06943407338412115, + "language_loss": 0.86278725, + "learning_rate": 0.00042828195163033966, + "loss": 0.87353367, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.10339355, + "routerloss_mlp": 0.0, + "step": 2908, + "time_per_iteration": 2.7045791149139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081822, + "balance_loss_mlp": 1.07135582, + "diversity_loss_mlp": 0.0, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07324820072157985, + "language_loss": 0.79102659, + "learning_rate": 0.0004279736447400812, + "loss": 0.80184484, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2909, + "time_per_iteration": 2.585176944732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107588, + "balance_loss_mlp": 1.06558049, + "diversity_loss_mlp": 0.0, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.07142642262643135, + "language_loss": 0.78468478, + "learning_rate": 0.00042766536581293385, + "loss": 0.79544365, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 2910, + "time_per_iteration": 2.723602771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07975566, + "diversity_loss_mlp": 0.0, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + "grad_norm": 0.0702995437532307, + "language_loss": 0.79552364, + "learning_rate": 0.0004273571149685819, + "loss": 0.80642736, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 2911, + "time_per_iteration": 2.7220258712768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091791, + "balance_loss_mlp": 1.08147311, + "diversity_loss_mlp": 0.0, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.06270923487878967, + "language_loss": 0.84021366, + "learning_rate": 0.00042704889232669937, + "loss": 0.85113156, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 2912, + "time_per_iteration": 2.6799380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00848913, + "balance_loss_mlp": 1.45588994, + "diversity_loss_mlp": 0.21708892, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.03254511626684893, + "language_loss": 0.85648382, + "learning_rate": 0.0004267406980069484, + "loss": 0.86497295, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01242387, + "step": 2913, + "time_per_iteration": 2.7309391498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.10193157, + "diversity_loss_mlp": 0.0, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.05402445789476675, + "language_loss": 0.79744071, + "learning_rate": 0.0004264325321289808, + "loss": 0.80856508, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.10510254, + "routerloss_mlp": 0.0, + "step": 2914, + "time_per_iteration": 2.8245773315429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104335, + "balance_loss_mlp": 1.09404707, + "diversity_loss_mlp": 0.0, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.07588418732744176, + "language_loss": 0.86308336, + "learning_rate": 0.00042612439481243736, + "loss": 0.87412667, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.10284424, + "routerloss_mlp": 0.0, + "step": 2915, + "time_per_iteration": 2.7910971641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_mlp": 1.09916496, + "diversity_loss_mlp": 0.0, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.07165476469353879, + "language_loss": 0.90284097, + "learning_rate": 0.00042581628617694735, + "loss": 0.91393661, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 2916, + "time_per_iteration": 2.7449898719787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00839442, + "balance_loss_mlp": 1.43753612, + "diversity_loss_mlp": 0.21687999, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.03331291255724556, + "language_loss": 0.81856477, + "learning_rate": 0.0004255082063421296, + "loss": 0.82695925, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01223436, + "step": 2917, + "time_per_iteration": 2.705263614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.12130046, + "diversity_loss_mlp": 0.0, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.07697799391889214, + "language_loss": 0.84842837, + "learning_rate": 0.00042520015542759065, + "loss": 0.85974395, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2918, + "time_per_iteration": 2.8643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110893, + "balance_loss_mlp": 1.09857666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.059259650717302215, + "language_loss": 0.88182557, + "learning_rate": 0.00042489213355292687, + "loss": 0.89291489, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2919, + "time_per_iteration": 2.871605634689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113923, + "balance_loss_mlp": 1.1035037, + "diversity_loss_mlp": 0.0, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.07025137955977834, + "language_loss": 0.81129396, + "learning_rate": 0.00042458414083772276, + "loss": 0.82243323, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.10424805, + "routerloss_mlp": 0.0, + "step": 2920, + "time_per_iteration": 2.5280137062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110383, + "balance_loss_mlp": 1.09353638, + "diversity_loss_mlp": 0.0, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.06291310679725345, + "language_loss": 0.85259616, + "learning_rate": 0.000424276177401552, + "loss": 0.86363447, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 2921, + "time_per_iteration": 2.8061861991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091107, + "balance_loss_mlp": 1.08052063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.06947728514830868, + "language_loss": 0.8586399, + "learning_rate": 0.0004239682433639763, + "loss": 0.86955094, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2922, + "time_per_iteration": 2.7068192958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.07726383, + "diversity_loss_mlp": 0.0, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.06724553342566655, + "language_loss": 0.85617495, + "learning_rate": 0.0004236603388445467, + "loss": 0.86705184, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 2923, + "time_per_iteration": 2.5658164024353027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083423, + "balance_loss_mlp": 1.07329023, + "diversity_loss_mlp": 0.0, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.06491959150956746, + "language_loss": 0.82087809, + "learning_rate": 0.00042335246396280166, + "loss": 0.83171237, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 2924, + "time_per_iteration": 2.7210686206817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076248, + "balance_loss_mlp": 1.06606197, + "diversity_loss_mlp": 0.0, + "epoch": 0.5627164293959215, + "flos": 450430253568.0, + "grad_norm": 0.06924351044147684, + "language_loss": 0.90442908, + "learning_rate": 0.0004230446188382693, + "loss": 0.91519153, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 2925, + "time_per_iteration": 2.5210559368133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072823, + "balance_loss_mlp": 1.06237423, + "diversity_loss_mlp": 0.0, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06189914516088338, + "language_loss": 0.80191588, + "learning_rate": 0.0004227368035904654, + "loss": 0.81264406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.10455322, + "routerloss_mlp": 0.0, + "step": 2926, + "time_per_iteration": 2.957545757293701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073658, + "balance_loss_mlp": 1.06312013, + "diversity_loss_mlp": 0.0, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.07119677802103677, + "language_loss": 0.8312782, + "learning_rate": 0.00042242901833889474, + "loss": 0.84201479, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 2927, + "time_per_iteration": 2.6197497844696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069584, + "balance_loss_mlp": 1.05933261, + "diversity_loss_mlp": 0.0, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.07548469953325632, + "language_loss": 0.85944557, + "learning_rate": 0.0004221212632030501, + "loss": 0.87014145, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 2928, + "time_per_iteration": 3.0718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074032, + "balance_loss_mlp": 1.0636375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.0702405954135719, + "language_loss": 0.8005904, + "learning_rate": 0.0004218135383024124, + "loss": 0.81133074, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2929, + "time_per_iteration": 2.6883885860443115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068804, + "balance_loss_mlp": 1.05836129, + "diversity_loss_mlp": 0.0, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.07423933793606223, + "language_loss": 0.85405028, + "learning_rate": 0.0004215058437564511, + "loss": 0.86473835, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2930, + "time_per_iteration": 2.5645458698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.06520677, + "diversity_loss_mlp": 0.0, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.07045402067927274, + "language_loss": 0.82365847, + "learning_rate": 0.00042119817968462397, + "loss": 0.83441579, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.10528564, + "routerloss_mlp": 0.0, + "step": 2931, + "time_per_iteration": 2.596431255340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843243, + "balance_loss_mlp": 1.44432163, + "diversity_loss_mlp": 0.21611315, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.034099962370994746, + "language_loss": 0.87154222, + "learning_rate": 0.0004208905462063766, + "loss": 0.8799746, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01302544, + "step": 2932, + "time_per_iteration": 2.7103724479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088146, + "balance_loss_mlp": 1.07760167, + "diversity_loss_mlp": 0.0, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07257480225633914, + "language_loss": 0.84035242, + "learning_rate": 0.00042058294344114315, + "loss": 0.8512339, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.10546875, + "routerloss_mlp": 0.0, + "step": 2933, + "time_per_iteration": 2.6817541122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00846618, + "balance_loss_mlp": 1.45035362, + "diversity_loss_mlp": 0.21710092, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.03239193802507573, + "language_loss": 0.77597153, + "learning_rate": 0.0004202753715083456, + "loss": 0.78443778, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01289086, + "step": 2934, + "time_per_iteration": 3.1172194480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.08684492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.08960488369203884, + "language_loss": 0.8126961, + "learning_rate": 0.0004199678305273936, + "loss": 0.82367325, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.10876465, + "routerloss_mlp": 0.0, + "step": 2935, + "time_per_iteration": 2.648293972015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096103, + "balance_loss_mlp": 1.08564794, + "diversity_loss_mlp": 0.0, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06584718006017456, + "language_loss": 0.81395173, + "learning_rate": 0.0004196603206176854, + "loss": 0.82491279, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.10461426, + "routerloss_mlp": 0.0, + "step": 2936, + "time_per_iteration": 2.9504921436309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110133, + "balance_loss_mlp": 1.09094691, + "diversity_loss_mlp": 0.0, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.06854637503151859, + "language_loss": 0.83705592, + "learning_rate": 0.000419352841898607, + "loss": 0.84806919, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2937, + "time_per_iteration": 2.965176582336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100386, + "balance_loss_mlp": 1.09003913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.06908295336200668, + "language_loss": 0.77684075, + "learning_rate": 0.000419045394489532, + "loss": 0.7878446, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 2938, + "time_per_iteration": 2.692997455596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094877, + "balance_loss_mlp": 1.08429718, + "diversity_loss_mlp": 0.0, + "epoch": 0.5654097729896114, + "flos": 820648060416.0, + "grad_norm": 0.06508171061148607, + "language_loss": 0.76831025, + "learning_rate": 0.0004187379785098224, + "loss": 0.77925897, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.10583496, + "routerloss_mlp": 0.0, + "step": 2939, + "time_per_iteration": 3.123154401779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110149, + "balance_loss_mlp": 1.09110653, + "diversity_loss_mlp": 0.0, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.08014464510269267, + "language_loss": 0.83749938, + "learning_rate": 0.00041843059407882744, + "loss": 0.84851432, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.10388184, + "routerloss_mlp": 0.0, + "step": 2940, + "time_per_iteration": 2.9720611572265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099107, + "balance_loss_mlp": 1.0887475, + "diversity_loss_mlp": 0.0, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.06910210619422795, + "language_loss": 0.82642627, + "learning_rate": 0.0004181232413158842, + "loss": 0.83741736, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2941, + "time_per_iteration": 2.657360315322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094217, + "balance_loss_mlp": 1.08388722, + "diversity_loss_mlp": 0.0, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08913898875539945, + "language_loss": 0.82192254, + "learning_rate": 0.0004178159203403179, + "loss": 0.83286464, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2942, + "time_per_iteration": 2.8812596797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080275, + "balance_loss_mlp": 1.07014799, + "diversity_loss_mlp": 0.0, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06202774017820852, + "language_loss": 0.8130517, + "learning_rate": 0.0004175086312714409, + "loss": 0.82385445, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 2943, + "time_per_iteration": 2.561537027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080645, + "balance_loss_mlp": 1.07015431, + "diversity_loss_mlp": 0.0, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.05809127095966742, + "language_loss": 0.83570457, + "learning_rate": 0.00041720137422855366, + "loss": 0.84651101, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 2944, + "time_per_iteration": 2.7395284175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.06576228, + "diversity_loss_mlp": 0.0, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.07239714207057282, + "language_loss": 0.79116005, + "learning_rate": 0.00041689414933094383, + "loss": 0.80191475, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2945, + "time_per_iteration": 2.654930353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067367, + "balance_loss_mlp": 1.05734193, + "diversity_loss_mlp": 0.0, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.07615309090382201, + "language_loss": 0.80823922, + "learning_rate": 0.00041658695669788653, + "loss": 0.81891298, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2946, + "time_per_iteration": 2.747903347015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_mlp": 1.05894506, + "diversity_loss_mlp": 0.0, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.09594015960064259, + "language_loss": 0.81304628, + "learning_rate": 0.00041627979644864453, + "loss": 0.82373923, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 2947, + "time_per_iteration": 2.8192365169525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064628, + "balance_loss_mlp": 1.05435264, + "diversity_loss_mlp": 0.0, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.06124486727819338, + "language_loss": 0.81212783, + "learning_rate": 0.0004159726687024683, + "loss": 0.82277411, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 2948, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_mlp": 1.05610037, + "diversity_loss_mlp": 0.0, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.0698899799050157, + "language_loss": 0.7929486, + "learning_rate": 0.00041566557357859506, + "loss": 0.80361444, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2949, + "time_per_iteration": 2.861374616622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05816913, + "diversity_loss_mlp": 0.0, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.0603589352170923, + "language_loss": 0.79605162, + "learning_rate": 0.0004153585111962502, + "loss": 0.80673802, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2950, + "time_per_iteration": 3.3136749267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.06528509, + "diversity_loss_mlp": 0.0, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.07046051490297799, + "language_loss": 0.84271163, + "learning_rate": 0.0004150514816746453, + "loss": 0.85347259, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 2951, + "time_per_iteration": 2.7142550945281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079575, + "balance_loss_mlp": 1.0689894, + "diversity_loss_mlp": 0.0, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07561213643312675, + "language_loss": 0.85564739, + "learning_rate": 0.0004147444851329802, + "loss": 0.8664431, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 2952, + "time_per_iteration": 2.663442611694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079915, + "balance_loss_mlp": 1.06943655, + "diversity_loss_mlp": 0.0, + "epoch": 0.5681031165833013, + "flos": 819459804672.0, + "grad_norm": 0.06334656392280237, + "language_loss": 0.85917854, + "learning_rate": 0.00041443752169044126, + "loss": 0.86997765, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 2953, + "time_per_iteration": 3.0424787998199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083209, + "balance_loss_mlp": 1.07296944, + "diversity_loss_mlp": 0.0, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.08759511227816434, + "language_loss": 0.84844387, + "learning_rate": 0.0004141305914662025, + "loss": 0.85927594, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2954, + "time_per_iteration": 2.720574378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080604, + "balance_loss_mlp": 1.06977344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0625505952609041, + "language_loss": 0.80443704, + "learning_rate": 0.0004138236945794246, + "loss": 0.81524312, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.10839844, + "routerloss_mlp": 0.0, + "step": 2955, + "time_per_iteration": 2.880007743835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067912, + "balance_loss_mlp": 1.05775595, + "diversity_loss_mlp": 0.0, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.08164782403227437, + "language_loss": 0.84066302, + "learning_rate": 0.00041351683114925576, + "loss": 0.85134214, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2956, + "time_per_iteration": 3.061213731765747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.06213737, + "diversity_loss_mlp": 0.0, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.06079019071224684, + "language_loss": 0.86355555, + "learning_rate": 0.0004132100012948308, + "loss": 0.87427759, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 2957, + "time_per_iteration": 2.631786823272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_mlp": 1.0587523, + "diversity_loss_mlp": 0.0, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.07979265854660174, + "language_loss": 0.84526646, + "learning_rate": 0.00041290320513527145, + "loss": 0.85595882, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.10473633, + "routerloss_mlp": 0.0, + "step": 2958, + "time_per_iteration": 2.5593366622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_mlp": 1.05111814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.09201222931646683, + "language_loss": 0.85128796, + "learning_rate": 0.0004125964427896867, + "loss": 0.86190271, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.1036377, + "routerloss_mlp": 0.0, + "step": 2959, + "time_per_iteration": 2.667381525039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.05320501, + "diversity_loss_mlp": 0.0, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06922825543149586, + "language_loss": 0.79212141, + "learning_rate": 0.0004122897143771723, + "loss": 0.80275661, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 2960, + "time_per_iteration": 2.523068904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.0569005, + "diversity_loss_mlp": 0.0, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06880331468011665, + "language_loss": 0.81306094, + "learning_rate": 0.0004119830200168109, + "loss": 0.82373345, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.10351562, + "routerloss_mlp": 0.0, + "step": 2961, + "time_per_iteration": 2.7224626541137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106382, + "balance_loss_mlp": 1.05356169, + "diversity_loss_mlp": 0.0, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08443053343043137, + "language_loss": 0.88515878, + "learning_rate": 0.0004116763598276714, + "loss": 0.89579695, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 2962, + "time_per_iteration": 2.4910728931427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067582, + "balance_loss_mlp": 1.05738318, + "diversity_loss_mlp": 0.0, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.07427131552828858, + "language_loss": 0.81298989, + "learning_rate": 0.00041136973392881017, + "loss": 0.82366574, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 2963, + "time_per_iteration": 2.8261218070983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_mlp": 1.05275846, + "diversity_loss_mlp": 0.0, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.0795338566562928, + "language_loss": 0.82039535, + "learning_rate": 0.00041106314243926983, + "loss": 0.83102989, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.10699463, + "routerloss_mlp": 0.0, + "step": 2964, + "time_per_iteration": 2.7321033477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058191, + "balance_loss_mlp": 1.04802823, + "diversity_loss_mlp": 0.0, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.07985594809339186, + "language_loss": 0.87473917, + "learning_rate": 0.0004107565854780798, + "loss": 0.88532114, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2965, + "time_per_iteration": 2.685188055038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105982, + "balance_loss_mlp": 1.0495863, + "diversity_loss_mlp": 0.0, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.12021988187086102, + "language_loss": 0.80887079, + "learning_rate": 0.000410450063164256, + "loss": 0.81946903, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.10241699, + "routerloss_mlp": 0.0, + "step": 2966, + "time_per_iteration": 2.8859732151031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_mlp": 1.05084372, + "diversity_loss_mlp": 0.0, + "epoch": 0.5707964601769911, + "flos": 476707410432.0, + "grad_norm": 0.07877125068742231, + "language_loss": 0.82298398, + "learning_rate": 0.00041014357561680115, + "loss": 0.83359516, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 2967, + "time_per_iteration": 2.5546090602874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072036, + "balance_loss_mlp": 1.06186163, + "diversity_loss_mlp": 0.0, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0603559044145355, + "language_loss": 0.86396813, + "learning_rate": 0.0004098371229547039, + "loss": 0.87468845, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 2968, + "time_per_iteration": 2.7246880531311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055906, + "balance_loss_mlp": 1.05082798, + "diversity_loss_mlp": 0.0, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.032213471653528905, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81066716, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 2969, + "time_per_iteration": 4.802457571029663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845784, + "balance_loss_mlp": 1.44834208, + "diversity_loss_mlp": 0.21849446, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.042172582609019446, + "language_loss": 0.80489594, + "learning_rate": 0.00040922432276247107, + "loss": 0.81335378, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01236574, + "step": 2970, + "time_per_iteration": 2.579711675643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100592, + "balance_loss_mlp": 1.09026289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.08651791755700546, + "language_loss": 0.84556907, + "learning_rate": 0.0004089179754702457, + "loss": 0.85657501, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 2971, + "time_per_iteration": 2.744509220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109172, + "balance_loss_mlp": 1.08128309, + "diversity_loss_mlp": 0.0, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.0875480726861112, + "language_loss": 0.79658413, + "learning_rate": 0.00040861166353919843, + "loss": 0.80750132, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.10443115, + "routerloss_mlp": 0.0, + "step": 2972, + "time_per_iteration": 2.816767692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00843649, + "balance_loss_mlp": 1.44322622, + "diversity_loss_mlp": 0.21953782, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.0303598736791247, + "language_loss": 0.81879437, + "learning_rate": 0.00040830538708824983, + "loss": 0.82723081, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226737, + "step": 2973, + "time_per_iteration": 2.8936269283294678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084736, + "balance_loss_mlp": 1.07479978, + "diversity_loss_mlp": 0.0, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.06866249599002382, + "language_loss": 0.81754982, + "learning_rate": 0.000407999146236307, + "loss": 0.82839715, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2974, + "time_per_iteration": 2.558587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086743, + "balance_loss_mlp": 1.07657444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.07286762161416734, + "language_loss": 0.83382261, + "learning_rate": 0.0004076929411022634, + "loss": 0.84468997, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2975, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082309, + "balance_loss_mlp": 1.07231879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.06868291627032407, + "language_loss": 0.79575276, + "learning_rate": 0.0004073867718049982, + "loss": 0.80657583, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 2976, + "time_per_iteration": 3.082519054412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00841274, + "balance_loss_mlp": 1.44052804, + "diversity_loss_mlp": 0.21771878, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.03510584247140754, + "language_loss": 0.8255651, + "learning_rate": 0.00040708063846337704, + "loss": 0.83397782, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01215104, + "step": 2977, + "time_per_iteration": 2.7563750743865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108914, + "balance_loss_mlp": 1.07897186, + "diversity_loss_mlp": 0.0, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07105452232664011, + "language_loss": 0.81019402, + "learning_rate": 0.00040677454119625143, + "loss": 0.82108539, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.10168457, + "routerloss_mlp": 0.0, + "step": 2978, + "time_per_iteration": 2.575923442840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089611, + "balance_loss_mlp": 1.07962155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.07243213986729599, + "language_loss": 0.82912952, + "learning_rate": 0.0004064684801224587, + "loss": 0.84002566, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 2979, + "time_per_iteration": 2.5965535640716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085844, + "balance_loss_mlp": 1.07600939, + "diversity_loss_mlp": 0.0, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.11138747568582645, + "language_loss": 0.80322999, + "learning_rate": 0.00040616245536082224, + "loss": 0.81408834, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2980, + "time_per_iteration": 2.599320650100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079792, + "balance_loss_mlp": 1.07008803, + "diversity_loss_mlp": 0.0, + "epoch": 0.573489803770681, + "flos": 592485041664.0, + "grad_norm": 0.06764455313032879, + "language_loss": 0.81366718, + "learning_rate": 0.00040585646703015165, + "loss": 0.82446504, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 2981, + "time_per_iteration": 2.8000056743621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0740515, + "diversity_loss_mlp": 0.0, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.07435230765684324, + "language_loss": 0.78094304, + "learning_rate": 0.0004055505152492419, + "loss": 0.79178286, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2982, + "time_per_iteration": 2.6867222785949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_mlp": 1.06574273, + "diversity_loss_mlp": 0.0, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.06874763078804642, + "language_loss": 0.74040514, + "learning_rate": 0.00040524460013687425, + "loss": 0.7511642, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 2983, + "time_per_iteration": 2.722419500350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06058455, + "diversity_loss_mlp": 0.0, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.06717754752260814, + "language_loss": 0.81118953, + "learning_rate": 0.0004049387218118155, + "loss": 0.82189637, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2984, + "time_per_iteration": 2.960744857788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_mlp": 1.05519915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07543134348802799, + "language_loss": 0.85138291, + "learning_rate": 0.00040463288039281777, + "loss": 0.86203879, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 2985, + "time_per_iteration": 2.769758939743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_mlp": 1.03847778, + "diversity_loss_mlp": 0.0, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.0202426857746204, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78919691, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 2986, + "time_per_iteration": 4.966659784317017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_mlp": 1.05206716, + "diversity_loss_mlp": 0.0, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.15131369926607025, + "language_loss": 0.82060635, + "learning_rate": 0.0004040213087479444, + "loss": 0.83122802, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 2987, + "time_per_iteration": 2.9445290565490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071505, + "balance_loss_mlp": 1.0615747, + "diversity_loss_mlp": 0.0, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.0782867157663105, + "language_loss": 0.85397077, + "learning_rate": 0.0004037155787595018, + "loss": 0.86468589, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 2988, + "time_per_iteration": 2.5765254497528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066911, + "balance_loss_mlp": 1.05708241, + "diversity_loss_mlp": 0.0, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06722963936024443, + "language_loss": 0.80743146, + "learning_rate": 0.000403409886151987, + "loss": 0.81810057, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 2989, + "time_per_iteration": 2.916736364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_mlp": 1.02410662, + "diversity_loss_mlp": 0.0, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.01652195359171043, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.8302803, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.0480957, + "routerloss_mlp": 0.0, + "step": 2990, + "time_per_iteration": 4.79939866065979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019783, + "balance_loss_mlp": 1.0149194, + "diversity_loss_mlp": 0.0, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.012607930583697005, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79218388, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 2991, + "time_per_iteration": 4.873241901397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_mlp": 1.06493187, + "diversity_loss_mlp": 0.0, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07321689676824589, + "language_loss": 0.7675758, + "learning_rate": 0.00040249303380173807, + "loss": 0.77832061, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 2992, + "time_per_iteration": 3.119454860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075897, + "balance_loss_mlp": 1.06607461, + "diversity_loss_mlp": 0.0, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.06951674167184135, + "language_loss": 0.78929973, + "learning_rate": 0.00040218749190459126, + "loss": 0.80005872, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 2993, + "time_per_iteration": 2.735741138458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074749, + "balance_loss_mlp": 1.06464601, + "diversity_loss_mlp": 0.0, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.09040694151318206, + "language_loss": 0.82524914, + "learning_rate": 0.00040188198798162775, + "loss": 0.83599663, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.10101318, + "routerloss_mlp": 0.0, + "step": 2994, + "time_per_iteration": 2.604189872741699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107928, + "balance_loss_mlp": 1.06903386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5761831473643709, + "flos": 587133287424.0, + "grad_norm": 0.07247823517444965, + "language_loss": 0.85413349, + "learning_rate": 0.000401576522151455, + "loss": 0.86492634, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.10247803, + "routerloss_mlp": 0.0, + "step": 2995, + "time_per_iteration": 2.8580820560455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082336, + "balance_loss_mlp": 1.07231033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.07641213429349043, + "language_loss": 0.82611746, + "learning_rate": 0.0004012710945326651, + "loss": 0.83694082, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.10021973, + "routerloss_mlp": 0.0, + "step": 2996, + "time_per_iteration": 2.7899913787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093927, + "balance_loss_mlp": 1.08396673, + "diversity_loss_mlp": 0.0, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.06499516885792743, + "language_loss": 0.81305802, + "learning_rate": 0.0004009657052438355, + "loss": 0.82399726, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 2997, + "time_per_iteration": 2.7985143661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109354, + "balance_loss_mlp": 1.08339536, + "diversity_loss_mlp": 0.0, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.07919341256021087, + "language_loss": 0.85873878, + "learning_rate": 0.00040066035440352904, + "loss": 0.86967415, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 2998, + "time_per_iteration": 2.633052110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_mlp": 1.02706063, + "diversity_loss_mlp": 0.0, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.024696349234847453, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80325484, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 2999, + "time_per_iteration": 4.901000022888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111633, + "balance_loss_mlp": 1.10161996, + "diversity_loss_mlp": 0.0, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.09685011562347093, + "language_loss": 0.76085562, + "learning_rate": 0.00040004976854266145, + "loss": 0.77197194, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3000, + "time_per_iteration": 2.5440561771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106478, + "balance_loss_mlp": 1.09615445, + "diversity_loss_mlp": 0.0, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.08566214489971447, + "language_loss": 0.81596673, + "learning_rate": 0.0003997445337591505, + "loss": 0.82703155, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.10327148, + "routerloss_mlp": 0.0, + "step": 3001, + "time_per_iteration": 2.6576101779937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101254, + "balance_loss_mlp": 1.09120488, + "diversity_loss_mlp": 0.0, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.07034086792873868, + "language_loss": 0.74008942, + "learning_rate": 0.0003994393378982635, + "loss": 0.75110197, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3002, + "time_per_iteration": 2.646756172180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_mlp": 1.02816153, + "diversity_loss_mlp": 0.0, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.018933197318392565, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80571294, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3003, + "time_per_iteration": 4.810927867889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084692, + "balance_loss_mlp": 1.07440448, + "diversity_loss_mlp": 0.0, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.09168460196837042, + "language_loss": 0.8788178, + "learning_rate": 0.0003988290634182961, + "loss": 0.88966477, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3004, + "time_per_iteration": 2.8026678562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086517, + "balance_loss_mlp": 1.0765686, + "diversity_loss_mlp": 0.0, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07023697016091271, + "language_loss": 0.80836314, + "learning_rate": 0.0003985239850361453, + "loss": 0.81922829, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3005, + "time_per_iteration": 2.605581760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108379, + "balance_loss_mlp": 1.0739491, + "diversity_loss_mlp": 0.0, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.08589270039345176, + "language_loss": 0.84542817, + "learning_rate": 0.0003982189460504777, + "loss": 0.85626608, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3006, + "time_per_iteration": 2.755309820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081707, + "balance_loss_mlp": 1.07148504, + "diversity_loss_mlp": 0.0, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07367765629951939, + "language_loss": 0.79058981, + "learning_rate": 0.00039791394657971935, + "loss": 0.80140698, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3007, + "time_per_iteration": 2.7115721702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083463, + "balance_loss_mlp": 1.07349145, + "diversity_loss_mlp": 0.0, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08639799759711958, + "language_loss": 0.84195948, + "learning_rate": 0.00039760898674228205, + "loss": 0.85279417, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3008, + "time_per_iteration": 2.6536192893981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.07249665, + "diversity_loss_mlp": 0.0, + "epoch": 0.5788764909580608, + "flos": 767404357632.0, + "grad_norm": 0.06522284264232586, + "language_loss": 0.80620825, + "learning_rate": 0.0003973040666565613, + "loss": 0.81703728, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.10406494, + "routerloss_mlp": 0.0, + "step": 3009, + "time_per_iteration": 3.0663528442382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083981, + "balance_loss_mlp": 1.07382393, + "diversity_loss_mlp": 0.0, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.06612730330601824, + "language_loss": 0.82148051, + "learning_rate": 0.000396999186440938, + "loss": 0.83232027, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3010, + "time_per_iteration": 2.8332176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078314, + "balance_loss_mlp": 1.06794286, + "diversity_loss_mlp": 0.0, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.0828593686110812, + "language_loss": 0.85258269, + "learning_rate": 0.000396694346213777, + "loss": 0.86336583, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.10369873, + "routerloss_mlp": 0.0, + "step": 3011, + "time_per_iteration": 2.6009714603424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107282, + "balance_loss_mlp": 1.06272256, + "diversity_loss_mlp": 0.0, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06962390382868744, + "language_loss": 0.83265769, + "learning_rate": 0.0003963895460934276, + "loss": 0.84338593, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3012, + "time_per_iteration": 3.1654391288757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069146, + "balance_loss_mlp": 1.05900097, + "diversity_loss_mlp": 0.0, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.07925389671051855, + "language_loss": 0.84790504, + "learning_rate": 0.00039608478619822376, + "loss": 0.85859656, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.10144043, + "routerloss_mlp": 0.0, + "step": 3013, + "time_per_iteration": 2.427522659301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.05792189, + "diversity_loss_mlp": 0.0, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06006231039706783, + "language_loss": 0.82350284, + "learning_rate": 0.00039578006664648394, + "loss": 0.83418107, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3014, + "time_per_iteration": 2.744586229324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073341, + "balance_loss_mlp": 1.06352377, + "diversity_loss_mlp": 0.0, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.06972986465808689, + "language_loss": 0.81348431, + "learning_rate": 0.0003954753875565105, + "loss": 0.82421774, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3015, + "time_per_iteration": 3.0640695095062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072847, + "balance_loss_mlp": 1.06282723, + "diversity_loss_mlp": 0.0, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.07357715078918559, + "language_loss": 0.82623494, + "learning_rate": 0.00039517074904659057, + "loss": 0.83696342, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3016, + "time_per_iteration": 2.6665265560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010727, + "balance_loss_mlp": 1.06269789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.06753013197016527, + "language_loss": 0.84737754, + "learning_rate": 0.00039486615123499535, + "loss": 0.85810453, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3017, + "time_per_iteration": 2.868724822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067949, + "balance_loss_mlp": 1.05761325, + "diversity_loss_mlp": 0.0, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.06414820954678578, + "language_loss": 0.84855384, + "learning_rate": 0.00039456159423997996, + "loss": 0.85923326, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.10333252, + "routerloss_mlp": 0.0, + "step": 3018, + "time_per_iteration": 2.7043581008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067563, + "balance_loss_mlp": 1.05765033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06908857206879536, + "language_loss": 0.89950442, + "learning_rate": 0.00039425707817978406, + "loss": 0.91018009, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3019, + "time_per_iteration": 2.661128044128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106838, + "balance_loss_mlp": 1.0578835, + "diversity_loss_mlp": 0.0, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.08125232064199928, + "language_loss": 0.83649898, + "learning_rate": 0.00039395260317263124, + "loss": 0.84718275, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.1050415, + "routerloss_mlp": 0.0, + "step": 3020, + "time_per_iteration": 2.5645148754119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070808, + "balance_loss_mlp": 1.06039524, + "diversity_loss_mlp": 0.0, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.06887634041791851, + "language_loss": 0.85043871, + "learning_rate": 0.0003936481693367291, + "loss": 0.86114681, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3021, + "time_per_iteration": 2.7062771320343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077533, + "balance_loss_mlp": 1.06673217, + "diversity_loss_mlp": 0.0, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08641696356618225, + "language_loss": 0.87619507, + "learning_rate": 0.0003933437767902697, + "loss": 0.88697034, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.10803223, + "routerloss_mlp": 0.0, + "step": 3022, + "time_per_iteration": 2.7680017948150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078804, + "balance_loss_mlp": 1.06846249, + "diversity_loss_mlp": 0.0, + "epoch": 0.5815698345517507, + "flos": 567475435008.0, + "grad_norm": 0.0708496595357851, + "language_loss": 0.78467089, + "learning_rate": 0.00039303942565142825, + "loss": 0.79545891, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3023, + "time_per_iteration": 2.7319986820220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071706, + "balance_loss_mlp": 1.06121564, + "diversity_loss_mlp": 0.0, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.06941107329713525, + "language_loss": 0.76844412, + "learning_rate": 0.0003927351160383644, + "loss": 0.77916121, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3024, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069902, + "balance_loss_mlp": 1.05980492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.07084631667240687, + "language_loss": 0.77815473, + "learning_rate": 0.000392430848069222, + "loss": 0.78885376, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.10095215, + "routerloss_mlp": 0.0, + "step": 3025, + "time_per_iteration": 2.5290136337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075514, + "balance_loss_mlp": 1.06532741, + "diversity_loss_mlp": 0.0, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.07224483468752362, + "language_loss": 0.82501459, + "learning_rate": 0.00039212662186212795, + "loss": 0.83576977, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3026, + "time_per_iteration": 2.6017684936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106609, + "balance_loss_mlp": 1.05593956, + "diversity_loss_mlp": 0.0, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.05478704818063415, + "language_loss": 0.77076197, + "learning_rate": 0.0003918224375351934, + "loss": 0.78142285, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3027, + "time_per_iteration": 2.707127571105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069708, + "balance_loss_mlp": 1.05940795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.07026049561627037, + "language_loss": 0.78559566, + "learning_rate": 0.0003915182952065135, + "loss": 0.79629278, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.10302734, + "routerloss_mlp": 0.0, + "step": 3028, + "time_per_iteration": 2.6728062629699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00863261, + "balance_loss_mlp": 1.48110199, + "diversity_loss_mlp": 0.21947324, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.028926470462326558, + "language_loss": 0.87632734, + "learning_rate": 0.0003912141949941664, + "loss": 0.88495994, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0129736, + "step": 3029, + "time_per_iteration": 2.7290279865264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05748928, + "diversity_loss_mlp": 0.0, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.11092566755711959, + "language_loss": 0.82848042, + "learning_rate": 0.0003909101370162143, + "loss": 0.83916146, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.10620117, + "routerloss_mlp": 0.0, + "step": 3030, + "time_per_iteration": 2.5907628536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057716, + "balance_loss_mlp": 1.05161262, + "diversity_loss_mlp": 0.0, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.028764883169419067, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73491609, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.06103516, + "routerloss_mlp": 0.0, + "step": 3031, + "time_per_iteration": 4.87787127494812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066859, + "balance_loss_mlp": 1.05651772, + "diversity_loss_mlp": 0.0, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.06710106844205427, + "language_loss": 0.82853395, + "learning_rate": 0.0003903021482356622, + "loss": 0.83920258, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.10345459, + "routerloss_mlp": 0.0, + "step": 3032, + "time_per_iteration": 2.777536153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05757427, + "diversity_loss_mlp": 0.0, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.05521171326439417, + "language_loss": 0.82775813, + "learning_rate": 0.00038999821766910465, + "loss": 0.83843517, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.10131836, + "routerloss_mlp": 0.0, + "step": 3033, + "time_per_iteration": 2.990370035171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064798, + "balance_loss_mlp": 1.05444503, + "diversity_loss_mlp": 0.0, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.06933125597123427, + "language_loss": 0.85725427, + "learning_rate": 0.00038969432980902606, + "loss": 0.86790228, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.10357666, + "routerloss_mlp": 0.0, + "step": 3034, + "time_per_iteration": 2.522594690322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101659, + "balance_loss_mlp": 1.01134527, + "diversity_loss_mlp": 0.0, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.016170176694849804, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80801094, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.05249023, + "routerloss_mlp": 0.0, + "step": 3035, + "time_per_iteration": 4.804777383804321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.06007361, + "diversity_loss_mlp": 0.0, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.06630987198212972, + "language_loss": 0.82630336, + "learning_rate": 0.00038908668268020953, + "loss": 0.83700585, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3036, + "time_per_iteration": 2.6598165035247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0547123, + "diversity_loss_mlp": 0.0, + "epoch": 0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.06353975651870693, + "language_loss": 0.85077345, + "learning_rate": 0.00038878292364738097, + "loss": 0.86142278, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3037, + "time_per_iteration": 2.817431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066587, + "balance_loss_mlp": 1.05653155, + "diversity_loss_mlp": 0.0, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.06847185322789755, + "language_loss": 0.86992419, + "learning_rate": 0.0003884792077928508, + "loss": 0.88059008, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.10058594, + "routerloss_mlp": 0.0, + "step": 3038, + "time_per_iteration": 2.515582323074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05704808, + "diversity_loss_mlp": 0.0, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.08132102193369704, + "language_loss": 0.76704037, + "learning_rate": 0.0003881755352345322, + "loss": 0.77771461, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3039, + "time_per_iteration": 2.506476402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.05959702, + "diversity_loss_mlp": 0.0, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.05655703451029381, + "language_loss": 0.87182224, + "learning_rate": 0.0003878719060903207, + "loss": 0.88252252, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.10437012, + "routerloss_mlp": 0.0, + "step": 3040, + "time_per_iteration": 2.5755503177642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077595, + "balance_loss_mlp": 1.06733704, + "diversity_loss_mlp": 0.0, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.07213898072930079, + "language_loss": 0.83620822, + "learning_rate": 0.0003875683204780961, + "loss": 0.84698415, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3041, + "time_per_iteration": 2.7087528705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00858209, + "balance_loss_mlp": 1.47420132, + "diversity_loss_mlp": 0.21720865, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.0337374590034744, + "language_loss": 0.85750413, + "learning_rate": 0.00038726477851572043, + "loss": 0.86608613, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01250451, + "step": 3042, + "time_per_iteration": 2.8391060829162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085797, + "balance_loss_mlp": 1.07552087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.07424787281712622, + "language_loss": 0.8043561, + "learning_rate": 0.0003869612803210395, + "loss": 0.81521404, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3043, + "time_per_iteration": 2.6728439331054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085503, + "balance_loss_mlp": 1.07525158, + "diversity_loss_mlp": 0.0, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.0731909762270397, + "language_loss": 0.83286428, + "learning_rate": 0.0003866578260118817, + "loss": 0.8437193, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.10253906, + "routerloss_mlp": 0.0, + "step": 3044, + "time_per_iteration": 2.6332969665527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108221, + "balance_loss_mlp": 1.07239914, + "diversity_loss_mlp": 0.0, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07445534470947208, + "language_loss": 0.82966632, + "learning_rate": 0.0003863544157060581, + "loss": 0.84048843, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3045, + "time_per_iteration": 2.668837785720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081605, + "balance_loss_mlp": 1.07137656, + "diversity_loss_mlp": 0.0, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.07387128485113956, + "language_loss": 0.82359195, + "learning_rate": 0.0003860510495213634, + "loss": 0.83440793, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3046, + "time_per_iteration": 2.8229498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106696, + "balance_loss_mlp": 1.05705416, + "diversity_loss_mlp": 0.0, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.08160785595799389, + "language_loss": 0.78622752, + "learning_rate": 0.0003857477275755746, + "loss": 0.79689717, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3047, + "time_per_iteration": 2.6294050216674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_mlp": 1.0557915, + "diversity_loss_mlp": 0.0, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.0580402220657833, + "language_loss": 0.83646655, + "learning_rate": 0.00038544444998645167, + "loss": 0.84712666, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3048, + "time_per_iteration": 3.0289785861968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059507, + "balance_loss_mlp": 1.04951751, + "diversity_loss_mlp": 0.0, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.0674332369398686, + "language_loss": 0.81847656, + "learning_rate": 0.00038514121687173767, + "loss": 0.82907164, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3049, + "time_per_iteration": 2.5797152519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058576, + "balance_loss_mlp": 1.04861593, + "diversity_loss_mlp": 0.0, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.08495884025795868, + "language_loss": 0.82019609, + "learning_rate": 0.00038483802834915807, + "loss": 0.83078188, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3050, + "time_per_iteration": 3.0199241638183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061818, + "balance_loss_mlp": 1.05154216, + "diversity_loss_mlp": 0.0, + "epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.07816426751212531, + "language_loss": 0.78978479, + "learning_rate": 0.00038453488453642074, + "loss": 0.800403, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3051, + "time_per_iteration": 2.7338953018188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_mlp": 1.04610801, + "diversity_loss_mlp": 0.0, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.07385283463746846, + "language_loss": 0.86878967, + "learning_rate": 0.00038423178555121697, + "loss": 0.87935388, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.10308838, + "routerloss_mlp": 0.0, + "step": 3052, + "time_per_iteration": 2.7545297145843506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_mlp": 1.04783666, + "diversity_loss_mlp": 0.0, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.07920619209623277, + "language_loss": 0.85583031, + "learning_rate": 0.00038392873151121994, + "loss": 0.86641347, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.1048584, + "routerloss_mlp": 0.0, + "step": 3053, + "time_per_iteration": 3.07143235206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04924083, + "diversity_loss_mlp": 0.0, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07754087781816771, + "language_loss": 0.83137167, + "learning_rate": 0.0003836257225340859, + "loss": 0.84196955, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.10552979, + "routerloss_mlp": 0.0, + "step": 3054, + "time_per_iteration": 2.6132304668426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066843, + "balance_loss_mlp": 1.05597091, + "diversity_loss_mlp": 0.0, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.0689474058081498, + "language_loss": 0.82020974, + "learning_rate": 0.00038332275873745336, + "loss": 0.83087826, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.10882568, + "routerloss_mlp": 0.0, + "step": 3055, + "time_per_iteration": 3.107823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00855378, + "balance_loss_mlp": 1.46855807, + "diversity_loss_mlp": 0.21676093, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.026786885849911755, + "language_loss": 0.82891941, + "learning_rate": 0.0003830198402389431, + "loss": 0.83747321, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01271825, + "step": 3056, + "time_per_iteration": 2.7645249366760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_mlp": 1.03548789, + "diversity_loss_mlp": 0.0, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.027829027984012215, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78389645, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3057, + "time_per_iteration": 4.995454549789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082248, + "balance_loss_mlp": 1.07115602, + "diversity_loss_mlp": 0.0, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.10105227922023945, + "language_loss": 0.83302426, + "learning_rate": 0.0003824141396066855, + "loss": 0.8438468, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.11096191, + "routerloss_mlp": 0.0, + "step": 3058, + "time_per_iteration": 2.568283796310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086707, + "balance_loss_mlp": 1.07570362, + "diversity_loss_mlp": 0.0, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.10870959422332387, + "language_loss": 0.8283565, + "learning_rate": 0.000382111357708092, + "loss": 0.83922356, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.10998535, + "routerloss_mlp": 0.0, + "step": 3059, + "time_per_iteration": 2.7063958644866943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06985879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.09017347087331092, + "language_loss": 0.83373827, + "learning_rate": 0.00038180862157792864, + "loss": 0.84454447, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.10760498, + "routerloss_mlp": 0.0, + "step": 3060, + "time_per_iteration": 2.7716259956359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_mlp": 1.06098306, + "diversity_loss_mlp": 0.0, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06780881013643715, + "language_loss": 0.81814772, + "learning_rate": 0.0003815059313337279, + "loss": 0.82886124, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3061, + "time_per_iteration": 2.664134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072016, + "balance_loss_mlp": 1.06180596, + "diversity_loss_mlp": 0.0, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.06335749004143083, + "language_loss": 0.78063929, + "learning_rate": 0.00038120328709300436, + "loss": 0.79135942, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3062, + "time_per_iteration": 2.8627028465270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066046, + "balance_loss_mlp": 1.05566847, + "diversity_loss_mlp": 0.0, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06769296518732247, + "language_loss": 0.8382163, + "learning_rate": 0.0003809006889732549, + "loss": 0.84887671, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.1038208, + "routerloss_mlp": 0.0, + "step": 3063, + "time_per_iteration": 2.809983253479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066721, + "balance_loss_mlp": 1.05686879, + "diversity_loss_mlp": 0.0, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.07471445768221775, + "language_loss": 0.88052714, + "learning_rate": 0.0003805981370919589, + "loss": 0.89119434, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3064, + "time_per_iteration": 2.526881456375122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_mlp": 1.05822492, + "diversity_loss_mlp": 0.0, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.06588713514234819, + "language_loss": 0.83812523, + "learning_rate": 0.0003802956315665771, + "loss": 0.84880579, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3065, + "time_per_iteration": 2.6691834926605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072593, + "balance_loss_mlp": 1.06285346, + "diversity_loss_mlp": 0.0, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.11425397529110681, + "language_loss": 0.8185159, + "learning_rate": 0.0003799931725145529, + "loss": 0.82924175, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3066, + "time_per_iteration": 2.6098556518554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06719375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.07983506473752326, + "language_loss": 0.85902935, + "learning_rate": 0.00037969076005331083, + "loss": 0.86980045, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3067, + "time_per_iteration": 2.7626185417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081851, + "balance_loss_mlp": 1.07184935, + "diversity_loss_mlp": 0.0, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07247659487205776, + "language_loss": 0.8802191, + "learning_rate": 0.00037938839430025817, + "loss": 0.89103758, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3068, + "time_per_iteration": 2.6493396759033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088575, + "balance_loss_mlp": 1.07886577, + "diversity_loss_mlp": 0.0, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.0655302097756617, + "language_loss": 0.85496283, + "learning_rate": 0.0003790860753727835, + "loss": 0.8658486, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3069, + "time_per_iteration": 2.7941815853118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089673, + "balance_loss_mlp": 1.07995713, + "diversity_loss_mlp": 0.0, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0796849495747384, + "language_loss": 0.82864797, + "learning_rate": 0.00037878380338825766, + "loss": 0.83954477, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3070, + "time_per_iteration": 2.6861939430236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102877, + "balance_loss_mlp": 1.09311378, + "diversity_loss_mlp": 0.0, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.08458672700427887, + "language_loss": 0.81556624, + "learning_rate": 0.00037848157846403287, + "loss": 0.82659507, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3071, + "time_per_iteration": 2.873662233352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101959, + "balance_loss_mlp": 1.09236836, + "diversity_loss_mlp": 0.0, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.07248408902015292, + "language_loss": 0.83281767, + "learning_rate": 0.0003781794007174435, + "loss": 0.84383726, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3072, + "time_per_iteration": 2.762472629547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088348, + "balance_loss_mlp": 1.08360386, + "diversity_loss_mlp": 0.0, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.032251872290910595, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75162888, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3073, + "time_per_iteration": 4.854618787765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_mlp": 1.09715033, + "diversity_loss_mlp": 0.0, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.058981009489694675, + "language_loss": 0.80947924, + "learning_rate": 0.0003775751872264152, + "loss": 0.8205511, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3074, + "time_per_iteration": 2.771085023880005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101985, + "balance_loss_mlp": 1.09195375, + "diversity_loss_mlp": 0.0, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.056077752757325364, + "language_loss": 0.87175214, + "learning_rate": 0.0003772731517165527, + "loss": 0.88277197, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.10028076, + "routerloss_mlp": 0.0, + "step": 3075, + "time_per_iteration": 2.8292393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103862, + "balance_loss_mlp": 1.09419441, + "diversity_loss_mlp": 0.0, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.07602524147414737, + "language_loss": 0.83311272, + "learning_rate": 0.0003769711638534784, + "loss": 0.84415126, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3076, + "time_per_iteration": 2.97261381149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099488, + "balance_loss_mlp": 1.08962953, + "diversity_loss_mlp": 0.0, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07287223806238774, + "language_loss": 0.79046565, + "learning_rate": 0.00037666922375443446, + "loss": 0.8014605, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3077, + "time_per_iteration": 2.6755480766296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093205, + "balance_loss_mlp": 1.08349538, + "diversity_loss_mlp": 0.0, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.06803693763690793, + "language_loss": 0.81907725, + "learning_rate": 0.00037636733153664396, + "loss": 0.83000934, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3078, + "time_per_iteration": 2.8055219650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109815, + "balance_loss_mlp": 1.08854795, + "diversity_loss_mlp": 0.0, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.08595437511710807, + "language_loss": 0.80202127, + "learning_rate": 0.0003760654873173124, + "loss": 0.81300277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3079, + "time_per_iteration": 2.6700353622436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089136, + "balance_loss_mlp": 1.07927787, + "diversity_loss_mlp": 0.0, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06826446524438025, + "language_loss": 0.82043588, + "learning_rate": 0.00037576369121362566, + "loss": 0.8313272, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3080, + "time_per_iteration": 2.596071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089019, + "balance_loss_mlp": 1.07946444, + "diversity_loss_mlp": 0.0, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.057614109423291045, + "language_loss": 0.81680822, + "learning_rate": 0.0003754619433427516, + "loss": 0.82769841, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3081, + "time_per_iteration": 2.9003093242645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.07771826, + "diversity_loss_mlp": 0.0, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.09118109008842482, + "language_loss": 0.7796042, + "learning_rate": 0.0003751602438218392, + "loss": 0.79047692, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3082, + "time_per_iteration": 2.7739951610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06927121, + "diversity_loss_mlp": 0.0, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.07641398361038237, + "language_loss": 0.84107417, + "learning_rate": 0.0003748585927680186, + "loss": 0.85186076, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3083, + "time_per_iteration": 2.6706809997558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087865, + "balance_loss_mlp": 1.07850111, + "diversity_loss_mlp": 0.0, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.07450452823339063, + "language_loss": 0.82992828, + "learning_rate": 0.00037455699029840086, + "loss": 0.84080696, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3084, + "time_per_iteration": 2.648775100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082396, + "balance_loss_mlp": 1.07310402, + "diversity_loss_mlp": 0.0, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.0678124296562273, + "language_loss": 0.84694779, + "learning_rate": 0.0003742554365300787, + "loss": 0.85777175, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3085, + "time_per_iteration": 2.787437677383423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00854998, + "balance_loss_mlp": 1.4632709, + "diversity_loss_mlp": 0.21810779, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.030613192067315453, + "language_loss": 0.79049134, + "learning_rate": 0.0003739539315801255, + "loss": 0.79904133, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01430825, + "step": 3086, + "time_per_iteration": 2.9476425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088902, + "balance_loss_mlp": 1.07956231, + "diversity_loss_mlp": 0.0, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.08021663243926581, + "language_loss": 0.91758776, + "learning_rate": 0.000373652475565596, + "loss": 0.92847675, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3087, + "time_per_iteration": 2.473820924758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086482, + "balance_loss_mlp": 1.07684994, + "diversity_loss_mlp": 0.0, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 0.0746565513598584, + "language_loss": 0.81288451, + "learning_rate": 0.00037335106860352587, + "loss": 0.8237493, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3088, + "time_per_iteration": 2.6710119247436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.07624292, + "diversity_loss_mlp": 0.0, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.06157127364570171, + "language_loss": 0.82947195, + "learning_rate": 0.00037304971081093146, + "loss": 0.84033072, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3089, + "time_per_iteration": 2.5530550479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095759, + "balance_loss_mlp": 1.0863055, + "diversity_loss_mlp": 0.0, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.06188782031055571, + "language_loss": 0.80896157, + "learning_rate": 0.00037274840230481024, + "loss": 0.81991911, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3090, + "time_per_iteration": 2.707697868347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094547, + "balance_loss_mlp": 1.08488476, + "diversity_loss_mlp": 0.0, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.07660649649984981, + "language_loss": 0.79309815, + "learning_rate": 0.00037244714320214077, + "loss": 0.80404359, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3091, + "time_per_iteration": 2.524418354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094365, + "balance_loss_mlp": 1.08449435, + "diversity_loss_mlp": 0.0, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.07189913531932149, + "language_loss": 0.83442843, + "learning_rate": 0.000372145933619882, + "loss": 0.84537208, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3092, + "time_per_iteration": 2.889267683029175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098289, + "balance_loss_mlp": 1.0883646, + "diversity_loss_mlp": 0.0, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.08404319768947686, + "language_loss": 0.82928061, + "learning_rate": 0.000371844773674974, + "loss": 0.84026349, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3093, + "time_per_iteration": 2.729433059692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00849837, + "balance_loss_mlp": 1.45755267, + "diversity_loss_mlp": 0.21677493, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.03215359042810467, + "language_loss": 0.82038867, + "learning_rate": 0.0003715436634843375, + "loss": 0.82888705, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01267278, + "step": 3094, + "time_per_iteration": 2.8759658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110065, + "balance_loss_mlp": 1.10049295, + "diversity_loss_mlp": 0.0, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.05868361705811182, + "language_loss": 0.80998492, + "learning_rate": 0.00037124260316487355, + "loss": 0.82108557, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3095, + "time_per_iteration": 2.8515610694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.11049807, + "diversity_loss_mlp": 0.0, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06311708190042467, + "language_loss": 0.89435279, + "learning_rate": 0.0003709415928334643, + "loss": 0.90555483, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3096, + "time_per_iteration": 2.5820794105529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00850727, + "balance_loss_mlp": 1.45894229, + "diversity_loss_mlp": 0.21772251, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.03378868601366531, + "language_loss": 0.80653715, + "learning_rate": 0.00037064063260697233, + "loss": 0.81504446, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01239414, + "step": 3097, + "time_per_iteration": 2.897676467895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138893, + "balance_loss_mlp": 1.12893891, + "diversity_loss_mlp": 0.0, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06769209825818075, + "language_loss": 0.78597271, + "learning_rate": 0.0003703397226022407, + "loss": 0.79736161, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3098, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_mlp": 1.05123568, + "diversity_loss_mlp": 0.0, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.0345928166567928, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76556545, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.05444336, + "routerloss_mlp": 0.0, + "step": 3099, + "time_per_iteration": 4.977718114852905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847219, + "balance_loss_mlp": 1.45243645, + "diversity_loss_mlp": 0.21764749, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.029968084230811296, + "language_loss": 0.83180296, + "learning_rate": 0.0003697380537253339, + "loss": 0.84027505, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01217673, + "step": 3100, + "time_per_iteration": 2.673551559448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121175, + "balance_loss_mlp": 1.11119175, + "diversity_loss_mlp": 0.0, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06630352939366652, + "language_loss": 0.81596649, + "learning_rate": 0.0003694372950867471, + "loss": 0.82717824, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3101, + "time_per_iteration": 2.7776670455932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119741, + "balance_loss_mlp": 1.1100198, + "diversity_loss_mlp": 0.0, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 0.07189145573728124, + "language_loss": 0.77408171, + "learning_rate": 0.0003691365871370976, + "loss": 0.78527915, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3102, + "time_per_iteration": 3.04355525970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116521, + "balance_loss_mlp": 1.1067102, + "diversity_loss_mlp": 0.0, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06839859357083694, + "language_loss": 0.8504554, + "learning_rate": 0.00036883592999313093, + "loss": 0.8616206, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3103, + "time_per_iteration": 2.6881608963012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111468, + "balance_loss_mlp": 1.1020087, + "diversity_loss_mlp": 0.0, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07720585150601726, + "language_loss": 0.7960434, + "learning_rate": 0.0003685353237715722, + "loss": 0.80715817, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3104, + "time_per_iteration": 2.910879135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104035, + "balance_loss_mlp": 1.09433126, + "diversity_loss_mlp": 0.0, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.08349083770410728, + "language_loss": 0.81658864, + "learning_rate": 0.0003682347685891274, + "loss": 0.82762903, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3105, + "time_per_iteration": 2.8556530475616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093856, + "balance_loss_mlp": 1.08412814, + "diversity_loss_mlp": 0.0, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07861180875636395, + "language_loss": 0.80587226, + "learning_rate": 0.0003679342645624822, + "loss": 0.81681079, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3106, + "time_per_iteration": 2.9788949489593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091288, + "balance_loss_mlp": 1.08144689, + "diversity_loss_mlp": 0.0, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.062123999367099406, + "language_loss": 0.81345969, + "learning_rate": 0.0003676338118083025, + "loss": 0.82437259, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3107, + "time_per_iteration": 3.0514276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083265, + "balance_loss_mlp": 1.07369304, + "diversity_loss_mlp": 0.0, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.07200241428310707, + "language_loss": 0.79341209, + "learning_rate": 0.0003673334104432347, + "loss": 0.8042447, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3108, + "time_per_iteration": 2.6402766704559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084433, + "balance_loss_mlp": 1.07493854, + "diversity_loss_mlp": 0.0, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.06431634181531254, + "language_loss": 0.83437502, + "learning_rate": 0.0003670330605839048, + "loss": 0.84521937, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3109, + "time_per_iteration": 2.8350021839141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071839, + "balance_loss_mlp": 1.06252289, + "diversity_loss_mlp": 0.0, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08338826074003908, + "language_loss": 0.76629049, + "learning_rate": 0.0003667327623469191, + "loss": 0.77700889, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3110, + "time_per_iteration": 2.7434427738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086579, + "balance_loss_mlp": 1.0770725, + "diversity_loss_mlp": 0.0, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07334566089126898, + "language_loss": 0.7758621, + "learning_rate": 0.00036643251584886333, + "loss": 0.78672791, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3111, + "time_per_iteration": 2.7712619304656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080276, + "balance_loss_mlp": 1.07075715, + "diversity_loss_mlp": 0.0, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.0661546294312284, + "language_loss": 0.81729323, + "learning_rate": 0.00036613232120630393, + "loss": 0.82809597, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3112, + "time_per_iteration": 2.6437926292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.06822348, + "diversity_loss_mlp": 0.0, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.09952194732663294, + "language_loss": 0.80305058, + "learning_rate": 0.00036583217853578643, + "loss": 0.81383061, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3113, + "time_per_iteration": 2.5917038917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085265, + "balance_loss_mlp": 1.07562053, + "diversity_loss_mlp": 0.0, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.09394979208953491, + "language_loss": 0.77671385, + "learning_rate": 0.000365532087953837, + "loss": 0.78756654, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3114, + "time_per_iteration": 3.6197850704193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075561, + "balance_loss_mlp": 1.06598282, + "diversity_loss_mlp": 0.0, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.08322265150120763, + "language_loss": 0.89675403, + "learning_rate": 0.00036523204957696065, + "loss": 0.90750962, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3115, + "time_per_iteration": 2.5928850173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05900383, + "diversity_loss_mlp": 0.0, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 0.07018475264035358, + "language_loss": 0.80565965, + "learning_rate": 0.00036493206352164324, + "loss": 0.81634748, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3116, + "time_per_iteration": 2.9302330017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070118, + "balance_loss_mlp": 1.06046212, + "diversity_loss_mlp": 0.0, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.07338463965566117, + "language_loss": 0.85090643, + "learning_rate": 0.000364632129904349, + "loss": 0.86160767, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3117, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072158, + "balance_loss_mlp": 1.0622344, + "diversity_loss_mlp": 0.0, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.06545944211786243, + "language_loss": 0.78013116, + "learning_rate": 0.00036433224884152283, + "loss": 0.79085279, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3118, + "time_per_iteration": 2.714756727218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_mlp": 1.06249511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08041065589047977, + "language_loss": 0.77752131, + "learning_rate": 0.00036403242044958875, + "loss": 0.78824466, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.09832764, + "routerloss_mlp": 0.0, + "step": 3119, + "time_per_iteration": 2.583292245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078089, + "balance_loss_mlp": 1.06846261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.07420053325288596, + "language_loss": 0.91699272, + "learning_rate": 0.0003637326448449507, + "loss": 0.92777365, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3120, + "time_per_iteration": 2.717006206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080309, + "balance_loss_mlp": 1.07065916, + "diversity_loss_mlp": 0.0, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.053625374444117885, + "language_loss": 0.86324787, + "learning_rate": 0.00036343292214399177, + "loss": 0.87405097, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3121, + "time_per_iteration": 2.7628395557403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092008, + "balance_loss_mlp": 1.08205438, + "diversity_loss_mlp": 0.0, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08110417303016995, + "language_loss": 0.77154052, + "learning_rate": 0.00036313325246307456, + "loss": 0.78246063, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3122, + "time_per_iteration": 2.7920055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097808, + "balance_loss_mlp": 1.08813453, + "diversity_loss_mlp": 0.0, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07750521229706399, + "language_loss": 0.87508434, + "learning_rate": 0.0003628336359185411, + "loss": 0.88606238, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3123, + "time_per_iteration": 2.6752257347106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086195, + "balance_loss_mlp": 1.07632422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09005007447476754, + "language_loss": 0.75524527, + "learning_rate": 0.000362534072626713, + "loss": 0.7661072, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3124, + "time_per_iteration": 2.7923338413238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077955, + "balance_loss_mlp": 1.06818557, + "diversity_loss_mlp": 0.0, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.07223530633843779, + "language_loss": 0.81714958, + "learning_rate": 0.00036223456270389093, + "loss": 0.82792914, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3125, + "time_per_iteration": 3.0091912746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075718, + "balance_loss_mlp": 1.06540036, + "diversity_loss_mlp": 0.0, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.06403369467156497, + "language_loss": 0.80792087, + "learning_rate": 0.00036193510626635517, + "loss": 0.81867802, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3126, + "time_per_iteration": 2.704378843307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066992, + "balance_loss_mlp": 1.05687714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.06193993783441067, + "language_loss": 0.81725299, + "learning_rate": 0.0003616357034303649, + "loss": 0.82792288, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3127, + "time_per_iteration": 3.002530813217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_mlp": 1.05243957, + "diversity_loss_mlp": 0.0, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.054941683840542065, + "language_loss": 0.78751493, + "learning_rate": 0.0003613363543121584, + "loss": 0.79813826, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3128, + "time_per_iteration": 2.8690690994262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063837, + "balance_loss_mlp": 1.05367482, + "diversity_loss_mlp": 0.0, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.06760978748019858, + "language_loss": 0.85022873, + "learning_rate": 0.00036103705902795357, + "loss": 0.86086708, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3129, + "time_per_iteration": 2.7233073711395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106265, + "balance_loss_mlp": 1.0526309, + "diversity_loss_mlp": 0.0, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 0.08999540715217709, + "language_loss": 0.79606092, + "learning_rate": 0.0003607378176939471, + "loss": 0.80668741, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3130, + "time_per_iteration": 2.6465327739715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060318, + "balance_loss_mlp": 1.0503943, + "diversity_loss_mlp": 0.0, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.0812918345139536, + "language_loss": 0.82358718, + "learning_rate": 0.00036043863042631465, + "loss": 0.83419037, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3131, + "time_per_iteration": 2.645275354385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060921, + "balance_loss_mlp": 1.05113363, + "diversity_loss_mlp": 0.0, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.07968064937120022, + "language_loss": 0.7648955, + "learning_rate": 0.00036013949734121133, + "loss": 0.77550471, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3132, + "time_per_iteration": 3.1564602851867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00847858, + "balance_loss_mlp": 1.44895816, + "diversity_loss_mlp": 0.22101411, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.03213509913040014, + "language_loss": 0.82544625, + "learning_rate": 0.00035984041855477043, + "loss": 0.83392477, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01287225, + "step": 3133, + "time_per_iteration": 2.7710041999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00606016, + "balance_loss_mlp": 1.03831875, + "diversity_loss_mlp": 0.14934492, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.0016585081527992916, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79315913, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01218408, + "step": 3134, + "time_per_iteration": 5.010243892669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058814, + "balance_loss_mlp": 1.04887819, + "diversity_loss_mlp": 0.0, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.06935738535706247, + "language_loss": 0.79867685, + "learning_rate": 0.00035924242434230637, + "loss": 0.80926502, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.0993042, + "routerloss_mlp": 0.0, + "step": 3135, + "time_per_iteration": 2.644461154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059705, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08930778928911463, + "language_loss": 0.78960454, + "learning_rate": 0.00035894350914844516, + "loss": 0.80020154, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3136, + "time_per_iteration": 2.6219546794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_mlp": 1.05073738, + "diversity_loss_mlp": 0.0, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.07477991129212373, + "language_loss": 0.82716846, + "learning_rate": 0.0003586446487175703, + "loss": 0.83777732, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3137, + "time_per_iteration": 2.7377843856811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057253, + "balance_loss_mlp": 1.04716182, + "diversity_loss_mlp": 0.0, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.06084036951856249, + "language_loss": 0.85439289, + "learning_rate": 0.0003583458431657099, + "loss": 0.86496538, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3138, + "time_per_iteration": 2.773810863494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056899, + "balance_loss_mlp": 1.04697502, + "diversity_loss_mlp": 0.0, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.10358798927054172, + "language_loss": 0.82887417, + "learning_rate": 0.00035804709260887056, + "loss": 0.83944315, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3139, + "time_per_iteration": 2.7064261436462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0084935, + "balance_loss_mlp": 1.45506001, + "diversity_loss_mlp": 0.21838406, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.02792942393132789, + "language_loss": 0.89382195, + "learning_rate": 0.0003577483971630373, + "loss": 0.9023155, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01262751, + "step": 3140, + "time_per_iteration": 2.747962236404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063532, + "balance_loss_mlp": 1.053352, + "diversity_loss_mlp": 0.0, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05833739987767841, + "language_loss": 0.84937215, + "learning_rate": 0.00035744975694417414, + "loss": 0.86000752, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.10180664, + "routerloss_mlp": 0.0, + "step": 3141, + "time_per_iteration": 2.886625289916992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060532, + "balance_loss_mlp": 1.05025589, + "diversity_loss_mlp": 0.0, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.07799366016494108, + "language_loss": 0.82322264, + "learning_rate": 0.00035715117206822344, + "loss": 0.83382797, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3142, + "time_per_iteration": 2.8120434284210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061407, + "balance_loss_mlp": 1.05125666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.06292121779847899, + "language_loss": 0.80965286, + "learning_rate": 0.0003568526426511065, + "loss": 0.82026696, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.10150146, + "routerloss_mlp": 0.0, + "step": 3143, + "time_per_iteration": 2.600508689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00857497, + "balance_loss_mlp": 1.4695704, + "diversity_loss_mlp": 0.22092447, + "epoch": 0.6048480184686418, + "flos": 776838117888.0, + "grad_norm": 0.033476134745844106, + "language_loss": 0.83131814, + "learning_rate": 0.000356554168808722, + "loss": 0.8398931, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0122495, + "step": 3144, + "time_per_iteration": 3.026810646057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106893, + "balance_loss_mlp": 1.058887, + "diversity_loss_mlp": 0.0, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07082652980877534, + "language_loss": 0.85014772, + "learning_rate": 0.00035625575065694837, + "loss": 0.86083698, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.10040283, + "routerloss_mlp": 0.0, + "step": 3145, + "time_per_iteration": 2.840867519378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845224, + "balance_loss_mlp": 1.44920301, + "diversity_loss_mlp": 0.21683007, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.03030378734616264, + "language_loss": 0.77627134, + "learning_rate": 0.0003559573883116415, + "loss": 0.78472358, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220777, + "step": 3146, + "time_per_iteration": 2.7349908351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107352, + "balance_loss_mlp": 1.06324959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.05605665058846549, + "language_loss": 0.85758018, + "learning_rate": 0.00035565908188863604, + "loss": 0.86831534, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.10272217, + "routerloss_mlp": 0.0, + "step": 3147, + "time_per_iteration": 2.8125319480895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00845087, + "balance_loss_mlp": 1.44807422, + "diversity_loss_mlp": 0.21802135, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.03003998541469304, + "language_loss": 0.79795343, + "learning_rate": 0.00035536083150374464, + "loss": 0.80640435, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203923, + "step": 3148, + "time_per_iteration": 2.8052470684051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.01191068, + "diversity_loss_mlp": 0.0, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.017174605961616223, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75765514, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3149, + "time_per_iteration": 4.839694023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068624, + "balance_loss_mlp": 1.05813408, + "diversity_loss_mlp": 0.0, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.07659984741592324, + "language_loss": 0.86092103, + "learning_rate": 0.0003547644993114475, + "loss": 0.87160718, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.10491943, + "routerloss_mlp": 0.0, + "step": 3150, + "time_per_iteration": 2.847841739654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072042, + "balance_loss_mlp": 1.06145024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.11052058943541425, + "language_loss": 0.79770887, + "learning_rate": 0.00035446641773555806, + "loss": 0.80842924, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.10595703, + "routerloss_mlp": 0.0, + "step": 3151, + "time_per_iteration": 2.748117208480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068453, + "balance_loss_mlp": 1.05804002, + "diversity_loss_mlp": 0.0, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.06928200582264574, + "language_loss": 0.87033039, + "learning_rate": 0.000354168392660816, + "loss": 0.88101488, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.10412598, + "routerloss_mlp": 0.0, + "step": 3152, + "time_per_iteration": 2.7237491607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064757, + "balance_loss_mlp": 1.05449951, + "diversity_loss_mlp": 0.0, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.08776252561897581, + "language_loss": 0.83035654, + "learning_rate": 0.0003538704242029252, + "loss": 0.84100413, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.1026001, + "routerloss_mlp": 0.0, + "step": 3153, + "time_per_iteration": 2.687469959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064416, + "balance_loss_mlp": 1.05382478, + "diversity_loss_mlp": 0.0, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.06996316305541914, + "language_loss": 0.78274238, + "learning_rate": 0.0003535725124775672, + "loss": 0.79338652, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.105896, + "routerloss_mlp": 0.0, + "step": 3154, + "time_per_iteration": 2.844794750213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056628, + "balance_loss_mlp": 1.04631591, + "diversity_loss_mlp": 0.0, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06399916678040601, + "language_loss": 0.86628783, + "learning_rate": 0.00035327465760040126, + "loss": 0.87685412, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3155, + "time_per_iteration": 2.7096383571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_mlp": 1.03957009, + "diversity_loss_mlp": 0.0, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08275092128409181, + "language_loss": 0.84610963, + "learning_rate": 0.00035297685968706526, + "loss": 0.85660648, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3156, + "time_per_iteration": 2.770024061203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_mlp": 1.04370594, + "diversity_loss_mlp": 0.0, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07863496537101755, + "language_loss": 0.83056825, + "learning_rate": 0.00035267911885317454, + "loss": 0.84110844, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.10314941, + "routerloss_mlp": 0.0, + "step": 3157, + "time_per_iteration": 2.671334743499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_mlp": 1.04051757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6075413620623317, + "flos": 586088193024.0, + "grad_norm": 0.06000790250856451, + "language_loss": 0.81843442, + "learning_rate": 0.0003523814352143222, + "loss": 0.82894027, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3158, + "time_per_iteration": 2.820080518722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04349208, + "diversity_loss_mlp": 0.0, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.0842902191025903, + "language_loss": 0.91154212, + "learning_rate": 0.00035208380888607937, + "loss": 0.92207724, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.10015869, + "routerloss_mlp": 0.0, + "step": 3159, + "time_per_iteration": 2.769655466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_mlp": 1.02448559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.01971528727847153, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80491835, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3160, + "time_per_iteration": 4.852057933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020567, + "balance_loss_mlp": 1.015203, + "diversity_loss_mlp": 0.0, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015706814795434412, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76712799, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.05371094, + "routerloss_mlp": 0.0, + "step": 3161, + "time_per_iteration": 5.034492015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105269, + "balance_loss_mlp": 1.04277158, + "diversity_loss_mlp": 0.0, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.07240231538807727, + "language_loss": 0.82060492, + "learning_rate": 0.00035119127492038446, + "loss": 0.83113182, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3162, + "time_per_iteration": 2.7958009243011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058888, + "balance_loss_mlp": 1.04918981, + "diversity_loss_mlp": 0.0, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.08243185287386566, + "language_loss": 0.8267377, + "learning_rate": 0.00035089387898984436, + "loss": 0.83732659, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3163, + "time_per_iteration": 3.0141196250915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_mlp": 1.04982388, + "diversity_loss_mlp": 0.0, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07404044041946549, + "language_loss": 0.81452298, + "learning_rate": 0.0003505965409474343, + "loss": 0.82512313, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3164, + "time_per_iteration": 2.884279727935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00822199, + "balance_loss_mlp": 1.40056133, + "diversity_loss_mlp": 0.21809974, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.02989314006565827, + "language_loss": 0.86555362, + "learning_rate": 0.0003502992609085913, + "loss": 0.8737756, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01286863, + "step": 3165, + "time_per_iteration": 2.665219306945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064479, + "balance_loss_mlp": 1.05481732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0721176964117247, + "language_loss": 0.82392001, + "learning_rate": 0.00035000203898872954, + "loss": 0.83456486, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3166, + "time_per_iteration": 3.0119569301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064027, + "balance_loss_mlp": 1.05416799, + "diversity_loss_mlp": 0.0, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.07129548452914211, + "language_loss": 0.84480536, + "learning_rate": 0.0003497048753032406, + "loss": 0.85544562, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3167, + "time_per_iteration": 2.854588031768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_mlp": 1.05985689, + "diversity_loss_mlp": 0.0, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.07231997141892146, + "language_loss": 0.80835009, + "learning_rate": 0.000349407769967494, + "loss": 0.8190484, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.09979248, + "routerloss_mlp": 0.0, + "step": 3168, + "time_per_iteration": 3.3936102390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_mlp": 1.06240892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.08318926372150726, + "language_loss": 0.8467539, + "learning_rate": 0.0003491107230968361, + "loss": 0.85747683, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.09881592, + "routerloss_mlp": 0.0, + "step": 3169, + "time_per_iteration": 2.618696928024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070277, + "balance_loss_mlp": 1.06021023, + "diversity_loss_mlp": 0.0, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06713277413300113, + "language_loss": 0.81751496, + "learning_rate": 0.00034881373480659085, + "loss": 0.82821774, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3170, + "time_per_iteration": 2.862299919128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063164, + "balance_loss_mlp": 1.05321598, + "diversity_loss_mlp": 0.0, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08200914133790435, + "language_loss": 0.77840459, + "learning_rate": 0.0003485168052120594, + "loss": 0.78903627, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3171, + "time_per_iteration": 2.564657688140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060206, + "balance_loss_mlp": 1.05049598, + "diversity_loss_mlp": 0.0, + "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.07281146068818606, + "language_loss": 0.80045426, + "learning_rate": 0.00034821993442851973, + "loss": 0.81105626, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3172, + "time_per_iteration": 2.6049551963806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058405, + "balance_loss_mlp": 1.04840922, + "diversity_loss_mlp": 0.0, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.08175384117022455, + "language_loss": 0.82176208, + "learning_rate": 0.00034792312257122735, + "loss": 0.83234608, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.09991455, + "routerloss_mlp": 0.0, + "step": 3173, + "time_per_iteration": 2.6007068157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00813523, + "balance_loss_mlp": 1.38556361, + "diversity_loss_mlp": 0.21673629, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.0335182000566727, + "language_loss": 0.80848879, + "learning_rate": 0.00034762636975541506, + "loss": 0.81662405, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237353, + "step": 3174, + "time_per_iteration": 2.6783013343811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061612, + "balance_loss_mlp": 1.05138397, + "diversity_loss_mlp": 0.0, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.07909505551334972, + "language_loss": 0.81032109, + "learning_rate": 0.0003473296760962923, + "loss": 0.82093716, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.10229492, + "routerloss_mlp": 0.0, + "step": 3175, + "time_per_iteration": 2.7157249450683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017221, + "balance_loss_mlp": 1.01159382, + "diversity_loss_mlp": 0.0, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.020158265394599716, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79550958, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.05615234, + "routerloss_mlp": 0.0, + "step": 3176, + "time_per_iteration": 4.707489728927612 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_mlp": 1.04915345, + "diversity_loss_mlp": 0.0, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.08734600695876651, + "language_loss": 0.8132062, + "learning_rate": 0.00034673646670883976, + "loss": 0.82379746, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.09973145, + "routerloss_mlp": 0.0, + "step": 3177, + "time_per_iteration": 2.965688705444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101108, + "balance_loss_mlp": 1.00557232, + "diversity_loss_mlp": 0.0, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.01801959168057259, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76726103, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.05517578, + "routerloss_mlp": 0.0, + "step": 3178, + "time_per_iteration": 4.958420991897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00819092, + "balance_loss_mlp": 1.39532781, + "diversity_loss_mlp": 0.21795917, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.031831362939539476, + "language_loss": 0.81821573, + "learning_rate": 0.0003461434953300865, + "loss": 0.82640672, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01244847, + "step": 3179, + "time_per_iteration": 2.92270827293396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063068, + "balance_loss_mlp": 1.05295873, + "diversity_loss_mlp": 0.0, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.055258394831610054, + "language_loss": 0.81141388, + "learning_rate": 0.0003458470991817515, + "loss": 0.82204449, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3180, + "time_per_iteration": 2.9693758487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060777, + "balance_loss_mlp": 1.05068588, + "diversity_loss_mlp": 0.0, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.06960725666926779, + "language_loss": 0.85075366, + "learning_rate": 0.0003455507628808802, + "loss": 0.86136144, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.10089111, + "routerloss_mlp": 0.0, + "step": 3181, + "time_per_iteration": 2.6036593914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071608, + "balance_loss_mlp": 1.06117702, + "diversity_loss_mlp": 0.0, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.09091925049493645, + "language_loss": 0.84135175, + "learning_rate": 0.00034525448654252076, + "loss": 0.85206783, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.10430908, + "routerloss_mlp": 0.0, + "step": 3182, + "time_per_iteration": 2.636809825897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061719, + "balance_loss_mlp": 1.05150867, + "diversity_loss_mlp": 0.0, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.07252100888517035, + "language_loss": 0.82806599, + "learning_rate": 0.0003449582702816976, + "loss": 0.83868313, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.10211182, + "routerloss_mlp": 0.0, + "step": 3183, + "time_per_iteration": 2.707475423812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070149, + "balance_loss_mlp": 1.05986118, + "diversity_loss_mlp": 0.0, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07323153161974344, + "language_loss": 0.82831162, + "learning_rate": 0.0003446621142134122, + "loss": 0.8390131, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.10290527, + "routerloss_mlp": 0.0, + "step": 3184, + "time_per_iteration": 2.6639719009399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068209, + "balance_loss_mlp": 1.05824375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.08088263565451759, + "language_loss": 0.84134692, + "learning_rate": 0.0003443660184526424, + "loss": 0.85202903, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3185, + "time_per_iteration": 2.465219736099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068542, + "balance_loss_mlp": 1.05862343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.06289917121629264, + "language_loss": 0.86502969, + "learning_rate": 0.0003440699831143429, + "loss": 0.87571514, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3186, + "time_per_iteration": 2.7979393005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_mlp": 1.05262065, + "diversity_loss_mlp": 0.0, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.07676649362634465, + "language_loss": 0.82236582, + "learning_rate": 0.0003437740083134449, + "loss": 0.83299029, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3187, + "time_per_iteration": 2.686150312423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066248, + "balance_loss_mlp": 1.0564487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.08991197971935971, + "language_loss": 0.83540225, + "learning_rate": 0.00034347809416485574, + "loss": 0.84606475, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3188, + "time_per_iteration": 2.604308605194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106686, + "balance_loss_mlp": 1.05696571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07330624647380965, + "language_loss": 0.81935883, + "learning_rate": 0.0003431822407834597, + "loss": 0.83002746, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3189, + "time_per_iteration": 2.786008596420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070082, + "balance_loss_mlp": 1.0602051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07745901872485048, + "language_loss": 0.84407461, + "learning_rate": 0.00034288644828411706, + "loss": 0.85477537, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3190, + "time_per_iteration": 3.4646387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078292, + "balance_loss_mlp": 1.06861246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.07529521339256182, + "language_loss": 0.75715351, + "learning_rate": 0.0003425907167816649, + "loss": 0.76793635, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3191, + "time_per_iteration": 2.874946117401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808796, + "balance_loss_mlp": 1.37378812, + "diversity_loss_mlp": 0.21839428, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.033870623426287425, + "language_loss": 0.84848714, + "learning_rate": 0.00034229504639091623, + "loss": 0.85657513, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01270431, + "step": 3192, + "time_per_iteration": 2.8179514408111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074782, + "balance_loss_mlp": 1.06519175, + "diversity_loss_mlp": 0.0, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.07980932307836838, + "language_loss": 0.79876941, + "learning_rate": 0.0003419994372266606, + "loss": 0.80951726, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3193, + "time_per_iteration": 3.121509552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_mlp": 1.06069219, + "diversity_loss_mlp": 0.0, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.05544583647367184, + "language_loss": 0.82228541, + "learning_rate": 0.00034170388940366335, + "loss": 0.83298671, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3194, + "time_per_iteration": 2.725961685180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071987, + "balance_loss_mlp": 1.0625093, + "diversity_loss_mlp": 0.0, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.06534437990847952, + "language_loss": 0.80109018, + "learning_rate": 0.0003414084030366667, + "loss": 0.81181002, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3195, + "time_per_iteration": 3.127318859100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073594, + "balance_loss_mlp": 1.06399155, + "diversity_loss_mlp": 0.0, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07171859971508983, + "language_loss": 0.83377409, + "learning_rate": 0.0003411129782403883, + "loss": 0.84451008, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3196, + "time_per_iteration": 2.7145206928253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078425, + "balance_loss_mlp": 1.06870365, + "diversity_loss_mlp": 0.0, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.09666217933122766, + "language_loss": 0.85076511, + "learning_rate": 0.0003408176151295225, + "loss": 0.86154932, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3197, + "time_per_iteration": 2.5919525623321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079298, + "balance_loss_mlp": 1.06990433, + "diversity_loss_mlp": 0.0, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.06581377475358774, + "language_loss": 0.77279031, + "learning_rate": 0.00034052231381873944, + "loss": 0.78358328, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3198, + "time_per_iteration": 2.597702741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082219, + "balance_loss_mlp": 1.07295024, + "diversity_loss_mlp": 0.0, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.0683279233493331, + "language_loss": 0.85131848, + "learning_rate": 0.00034022707442268494, + "loss": 0.8621406, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3199, + "time_per_iteration": 2.562068223953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080014, + "balance_loss_mlp": 1.07069743, + "diversity_loss_mlp": 0.0, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.0761762485373057, + "language_loss": 0.82035017, + "learning_rate": 0.0003399318970559813, + "loss": 0.83115035, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3200, + "time_per_iteration": 2.789898157119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_mlp": 1.07100666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.08069642466901547, + "language_loss": 0.84662288, + "learning_rate": 0.00033963678183322656, + "loss": 0.85742772, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3201, + "time_per_iteration": 3.026878595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091206, + "balance_loss_mlp": 1.08173513, + "diversity_loss_mlp": 0.0, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.059556899615455, + "language_loss": 0.82784677, + "learning_rate": 0.0003393417288689945, + "loss": 0.83875883, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3202, + "time_per_iteration": 2.6654982566833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090691, + "balance_loss_mlp": 1.08118427, + "diversity_loss_mlp": 0.0, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.07467788423655687, + "language_loss": 0.76113433, + "learning_rate": 0.00033904673827783504, + "loss": 0.77204126, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3203, + "time_per_iteration": 2.92669939994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010864, + "balance_loss_mlp": 1.07689261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.06286363142909755, + "language_loss": 0.8181622, + "learning_rate": 0.00033875181017427357, + "loss": 0.82902622, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3204, + "time_per_iteration": 2.5680675506591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090188, + "balance_loss_mlp": 1.08068752, + "diversity_loss_mlp": 0.0, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.07085405603281952, + "language_loss": 0.81132901, + "learning_rate": 0.00033845694467281133, + "loss": 0.82223082, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3205, + "time_per_iteration": 2.8592958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806951, + "balance_loss_mlp": 1.37197065, + "diversity_loss_mlp": 0.21751499, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.030824309293312202, + "language_loss": 0.83412218, + "learning_rate": 0.00033816214188792516, + "loss": 0.84219164, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220786, + "step": 3206, + "time_per_iteration": 3.1863744258880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087794, + "balance_loss_mlp": 1.07844186, + "diversity_loss_mlp": 0.0, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.07935266980456598, + "language_loss": 0.85488075, + "learning_rate": 0.00033786740193406784, + "loss": 0.86575866, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3207, + "time_per_iteration": 2.626253604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.07682097, + "diversity_loss_mlp": 0.0, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.07540350896316815, + "language_loss": 0.81724775, + "learning_rate": 0.00033757272492566736, + "loss": 0.82811046, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3208, + "time_per_iteration": 2.8899030685424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080715, + "balance_loss_mlp": 1.07114851, + "diversity_loss_mlp": 0.0, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.05796890161537444, + "language_loss": 0.87216032, + "learning_rate": 0.0003372781109771278, + "loss": 0.88296747, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3209, + "time_per_iteration": 2.752558708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077325, + "balance_loss_mlp": 1.06753802, + "diversity_loss_mlp": 0.0, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.06419749590312054, + "language_loss": 0.76373756, + "learning_rate": 0.0003369835602028281, + "loss": 0.7745108, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3210, + "time_per_iteration": 2.7878270149230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_mlp": 1.05842817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.0669620080474601, + "language_loss": 0.79502624, + "learning_rate": 0.0003366890727171232, + "loss": 0.8057074, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3211, + "time_per_iteration": 2.7112903594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069089, + "balance_loss_mlp": 1.05950451, + "diversity_loss_mlp": 0.0, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08442057123784988, + "language_loss": 0.78359348, + "learning_rate": 0.00033639464863434313, + "loss": 0.79428434, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3212, + "time_per_iteration": 2.634425163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_mlp": 1.03023958, + "diversity_loss_mlp": 0.0, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.02134222442632316, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478121, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3213, + "time_per_iteration": 4.7891459465026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066789, + "balance_loss_mlp": 1.05715084, + "diversity_loss_mlp": 0.0, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.07602232380536252, + "language_loss": 0.79711038, + "learning_rate": 0.00033580599113475543, + "loss": 0.80777824, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3214, + "time_per_iteration": 2.987006187438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065135, + "balance_loss_mlp": 1.0553956, + "diversity_loss_mlp": 0.0, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.0762428760353498, + "language_loss": 0.86394417, + "learning_rate": 0.00033551175794648507, + "loss": 0.87459552, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.09735107, + "routerloss_mlp": 0.0, + "step": 3215, + "time_per_iteration": 2.4780433177948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_mlp": 1.05447078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.059308624592263506, + "language_loss": 0.81911212, + "learning_rate": 0.00033521758861821365, + "loss": 0.82975602, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3216, + "time_per_iteration": 2.5746333599090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062859, + "balance_loss_mlp": 1.05332255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.06339313693664829, + "language_loss": 0.89093363, + "learning_rate": 0.0003349234832641479, + "loss": 0.90156221, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3217, + "time_per_iteration": 2.561518669128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062704, + "balance_loss_mlp": 1.05323243, + "diversity_loss_mlp": 0.0, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07035473810033784, + "language_loss": 0.81230485, + "learning_rate": 0.00033462944199846975, + "loss": 0.82293189, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3218, + "time_per_iteration": 3.0372345447540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065156, + "balance_loss_mlp": 1.05549467, + "diversity_loss_mlp": 0.0, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07112802613336307, + "language_loss": 0.86179578, + "learning_rate": 0.00033433546493533606, + "loss": 0.87244731, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3219, + "time_per_iteration": 2.4615468978881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066941, + "balance_loss_mlp": 1.05763078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.07983484825062852, + "language_loss": 0.84651643, + "learning_rate": 0.00033404155218887897, + "loss": 0.8571859, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3220, + "time_per_iteration": 2.725001335144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066491, + "balance_loss_mlp": 1.05722845, + "diversity_loss_mlp": 0.0, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.05498489673307501, + "language_loss": 0.87258649, + "learning_rate": 0.00033374770387320534, + "loss": 0.88325131, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3221, + "time_per_iteration": 2.7884719371795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066206, + "balance_loss_mlp": 1.05684233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.06826724081601121, + "language_loss": 0.85091376, + "learning_rate": 0.00033345392010239737, + "loss": 0.86157584, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3222, + "time_per_iteration": 2.758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_mlp": 1.06346869, + "diversity_loss_mlp": 0.0, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 0.07112470494876487, + "language_loss": 0.82199866, + "learning_rate": 0.0003331602009905118, + "loss": 0.8327266, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3223, + "time_per_iteration": 2.7497544288635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073437, + "balance_loss_mlp": 1.06405497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.06198906744782324, + "language_loss": 0.8420788, + "learning_rate": 0.00033286654665158085, + "loss": 0.85281318, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3224, + "time_per_iteration": 2.938769817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00805444, + "balance_loss_mlp": 1.36691594, + "diversity_loss_mlp": 0.21943557, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.03128305924884035, + "language_loss": 0.87915754, + "learning_rate": 0.0003325729571996109, + "loss": 0.88721198, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01226849, + "step": 3225, + "time_per_iteration": 2.6774377822875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.07079625, + "diversity_loss_mlp": 0.0, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.15310961758991004, + "language_loss": 0.83791566, + "learning_rate": 0.000332279432748584, + "loss": 0.8487193, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3226, + "time_per_iteration": 2.723944664001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078837, + "balance_loss_mlp": 1.06965768, + "diversity_loss_mlp": 0.0, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.06102841985942585, + "language_loss": 0.87609762, + "learning_rate": 0.00033198597341245576, + "loss": 0.886886, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3227, + "time_per_iteration": 2.6077282428741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107844, + "balance_loss_mlp": 1.06877792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.05859377500804419, + "language_loss": 0.81977952, + "learning_rate": 0.00033169257930515763, + "loss": 0.8305639, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3228, + "time_per_iteration": 3.0201709270477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079582, + "balance_loss_mlp": 1.06983042, + "diversity_loss_mlp": 0.0, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06260829937623101, + "language_loss": 0.81892502, + "learning_rate": 0.0003313992505405951, + "loss": 0.82972085, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3229, + "time_per_iteration": 2.7065281867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_mlp": 1.07612467, + "diversity_loss_mlp": 0.0, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.07524693848551285, + "language_loss": 0.81223184, + "learning_rate": 0.0003311059872326487, + "loss": 0.82308924, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3230, + "time_per_iteration": 2.6831164360046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082096, + "balance_loss_mlp": 1.07257652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.08041283658351392, + "language_loss": 0.792005, + "learning_rate": 0.0003308127894951734, + "loss": 0.80282593, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3231, + "time_per_iteration": 2.6133408546447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07829607, + "diversity_loss_mlp": 0.0, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.0806270364015219, + "language_loss": 0.86446661, + "learning_rate": 0.00033051965744199834, + "loss": 0.87534499, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3232, + "time_per_iteration": 2.7565104961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081354, + "balance_loss_mlp": 1.07194829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.06624380464527684, + "language_loss": 0.90293765, + "learning_rate": 0.0003302265911869276, + "loss": 0.91375124, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3233, + "time_per_iteration": 2.926671266555786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070794, + "balance_loss_mlp": 1.06132245, + "diversity_loss_mlp": 0.0, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.08213933441923858, + "language_loss": 0.84280741, + "learning_rate": 0.0003299335908437397, + "loss": 0.85351539, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3234, + "time_per_iteration": 2.5910556316375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074187, + "balance_loss_mlp": 1.06473994, + "diversity_loss_mlp": 0.0, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08585428313311574, + "language_loss": 0.79975766, + "learning_rate": 0.0003296406565261873, + "loss": 0.81049955, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3235, + "time_per_iteration": 2.4815149307250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_mlp": 1.06017601, + "diversity_loss_mlp": 0.0, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.07182021420774376, + "language_loss": 0.84884858, + "learning_rate": 0.0003293477883479978, + "loss": 0.85954452, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3236, + "time_per_iteration": 2.821707248687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.05992377, + "diversity_loss_mlp": 0.0, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 0.08520791019751349, + "language_loss": 0.79754794, + "learning_rate": 0.0003290549864228727, + "loss": 0.80824208, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3237, + "time_per_iteration": 2.932542324066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075658, + "balance_loss_mlp": 1.06604934, + "diversity_loss_mlp": 0.0, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.07053580491728426, + "language_loss": 0.86281902, + "learning_rate": 0.0003287622508644875, + "loss": 0.87357557, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3238, + "time_per_iteration": 2.742324113845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00814101, + "balance_loss_mlp": 1.38574493, + "diversity_loss_mlp": 0.21743111, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.03587473659698897, + "language_loss": 0.86128193, + "learning_rate": 0.0003284695817864923, + "loss": 0.86942297, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01251296, + "step": 3239, + "time_per_iteration": 2.5240445137023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071749, + "balance_loss_mlp": 1.06229532, + "diversity_loss_mlp": 0.0, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.08834225044652763, + "language_loss": 0.84207428, + "learning_rate": 0.0003281769793025116, + "loss": 0.85279179, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3240, + "time_per_iteration": 2.733356237411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00812174, + "balance_loss_mlp": 1.3801111, + "diversity_loss_mlp": 0.21927354, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.03793852776762896, + "language_loss": 0.8948651, + "learning_rate": 0.00032788444352614346, + "loss": 0.90298682, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01248194, + "step": 3241, + "time_per_iteration": 2.599942922592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_mlp": 1.06840372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.07096292336409799, + "language_loss": 0.80582923, + "learning_rate": 0.0003275919745709606, + "loss": 0.81660759, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3242, + "time_per_iteration": 2.5855822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079567, + "balance_loss_mlp": 1.07014906, + "diversity_loss_mlp": 0.0, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.06686828549294242, + "language_loss": 0.81972641, + "learning_rate": 0.00032729957255050936, + "loss": 0.83052206, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3243, + "time_per_iteration": 2.652064561843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079309, + "balance_loss_mlp": 1.06973052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.0716805986451115, + "language_loss": 0.81674051, + "learning_rate": 0.0003270072375783102, + "loss": 0.8275336, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3244, + "time_per_iteration": 2.894718647003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070218, + "balance_loss_mlp": 1.06071746, + "diversity_loss_mlp": 0.0, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.06745739273028781, + "language_loss": 0.79402959, + "learning_rate": 0.00032671496976785774, + "loss": 0.80473179, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3245, + "time_per_iteration": 2.637991428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077124, + "balance_loss_mlp": 1.06772995, + "diversity_loss_mlp": 0.0, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06297519573167677, + "language_loss": 0.7578575, + "learning_rate": 0.0003264227692326205, + "loss": 0.76862872, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3246, + "time_per_iteration": 3.0627310276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010763, + "balance_loss_mlp": 1.06653643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06711643928809063, + "language_loss": 0.85974544, + "learning_rate": 0.00032613063608604055, + "loss": 0.87050849, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3247, + "time_per_iteration": 2.6602516174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.0650897, + "diversity_loss_mlp": 0.0, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.06836828090896512, + "language_loss": 0.8368777, + "learning_rate": 0.0003258385704415343, + "loss": 0.84762454, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3248, + "time_per_iteration": 2.5850605964660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068989, + "balance_loss_mlp": 1.05929732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.0567839390219681, + "language_loss": 0.82901073, + "learning_rate": 0.0003255465724124915, + "loss": 0.83970058, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3249, + "time_per_iteration": 2.7133941650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068793, + "balance_loss_mlp": 1.05952442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.05839887652934639, + "language_loss": 0.82966471, + "learning_rate": 0.00032525464211227587, + "loss": 0.84035265, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3250, + "time_per_iteration": 2.611469030380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071121, + "balance_loss_mlp": 1.06180525, + "diversity_loss_mlp": 0.0, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 0.07351416510504778, + "language_loss": 0.85770059, + "learning_rate": 0.0003249627796542249, + "loss": 0.8684119, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3251, + "time_per_iteration": 2.6665618419647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066555, + "balance_loss_mlp": 1.05709553, + "diversity_loss_mlp": 0.0, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06415360650327814, + "language_loss": 0.84284747, + "learning_rate": 0.00032467098515164943, + "loss": 0.853513, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3252, + "time_per_iteration": 2.8863329887390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069653, + "balance_loss_mlp": 1.06005657, + "diversity_loss_mlp": 0.0, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.07319159145136593, + "language_loss": 0.83726692, + "learning_rate": 0.00032437925871783456, + "loss": 0.84796345, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3253, + "time_per_iteration": 2.6411869525909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107529, + "balance_loss_mlp": 1.06570566, + "diversity_loss_mlp": 0.0, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.06969705547120199, + "language_loss": 0.84202456, + "learning_rate": 0.00032408760046603803, + "loss": 0.85277742, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3254, + "time_per_iteration": 2.79947829246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070892, + "balance_loss_mlp": 1.06131983, + "diversity_loss_mlp": 0.0, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06622216529123302, + "language_loss": 0.77594912, + "learning_rate": 0.00032379601050949193, + "loss": 0.78665805, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3255, + "time_per_iteration": 3.089614152908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073078, + "balance_loss_mlp": 1.06385732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.06913459813204618, + "language_loss": 0.88098216, + "learning_rate": 0.0003235044889614013, + "loss": 0.8917129, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3256, + "time_per_iteration": 2.5961923599243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076462, + "balance_loss_mlp": 1.0670923, + "diversity_loss_mlp": 0.0, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.07985483332339025, + "language_loss": 0.83828497, + "learning_rate": 0.0003232130359349451, + "loss": 0.84904957, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3257, + "time_per_iteration": 2.8164010047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106986, + "balance_loss_mlp": 1.06043053, + "diversity_loss_mlp": 0.0, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.06128522405733426, + "language_loss": 0.81820428, + "learning_rate": 0.0003229216515432751, + "loss": 0.82890296, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3258, + "time_per_iteration": 2.7743678092956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00804618, + "balance_loss_mlp": 1.36253858, + "diversity_loss_mlp": 0.22081783, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.03450370763198899, + "language_loss": 0.80067343, + "learning_rate": 0.0003226303358995174, + "loss": 0.80871964, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01293936, + "step": 3259, + "time_per_iteration": 2.6309425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065495, + "balance_loss_mlp": 1.05593443, + "diversity_loss_mlp": 0.0, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.05636981182900784, + "language_loss": 0.88916153, + "learning_rate": 0.00032233908911677, + "loss": 0.89981651, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3260, + "time_per_iteration": 2.847928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072322, + "balance_loss_mlp": 1.06297052, + "diversity_loss_mlp": 0.0, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.07940970349438319, + "language_loss": 0.810615, + "learning_rate": 0.0003220479113081053, + "loss": 0.8213383, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3261, + "time_per_iteration": 2.7070260047912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070214, + "balance_loss_mlp": 1.06123137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.06801817573689214, + "language_loss": 0.78964686, + "learning_rate": 0.00032175680258656836, + "loss": 0.80034894, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3262, + "time_per_iteration": 2.7481493949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067516, + "balance_loss_mlp": 1.05819941, + "diversity_loss_mlp": 0.0, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06408124041259919, + "language_loss": 0.80091017, + "learning_rate": 0.00032146576306517794, + "loss": 0.81158531, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3263, + "time_per_iteration": 2.799330949783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071623, + "balance_loss_mlp": 1.06242585, + "diversity_loss_mlp": 0.0, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.06510106509747231, + "language_loss": 0.80605328, + "learning_rate": 0.0003211747928569255, + "loss": 0.81676954, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3264, + "time_per_iteration": 2.71992826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071469, + "balance_loss_mlp": 1.06197381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 0.06441574996580214, + "language_loss": 0.8154881, + "learning_rate": 0.0003208838920747754, + "loss": 0.82620275, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3265, + "time_per_iteration": 2.8526246547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073261, + "balance_loss_mlp": 1.06409347, + "diversity_loss_mlp": 0.0, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.07893812182761015, + "language_loss": 0.76554495, + "learning_rate": 0.0003205930608316656, + "loss": 0.7762776, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3266, + "time_per_iteration": 3.4734575748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066498, + "balance_loss_mlp": 1.05708683, + "diversity_loss_mlp": 0.0, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.06620674427686414, + "language_loss": 0.85159075, + "learning_rate": 0.00032030229924050673, + "loss": 0.86225569, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3267, + "time_per_iteration": 2.7024662494659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072026, + "balance_loss_mlp": 1.06285858, + "diversity_loss_mlp": 0.0, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.06417389888600762, + "language_loss": 0.79950488, + "learning_rate": 0.00032001160741418247, + "loss": 0.81022519, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3268, + "time_per_iteration": 2.6112074851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_mlp": 1.05720639, + "diversity_loss_mlp": 0.0, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.08748068388552233, + "language_loss": 0.82228744, + "learning_rate": 0.0003197209854655494, + "loss": 0.83295155, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3269, + "time_per_iteration": 2.642714500427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064334, + "balance_loss_mlp": 1.05507767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07987454353472763, + "language_loss": 0.74589109, + "learning_rate": 0.0003194304335074371, + "loss": 0.7565344, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3270, + "time_per_iteration": 2.8935019969940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061724, + "balance_loss_mlp": 1.05230033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07476368913364388, + "language_loss": 0.8843264, + "learning_rate": 0.0003191399516526475, + "loss": 0.89494365, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3271, + "time_per_iteration": 2.5182955265045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010675, + "balance_loss_mlp": 1.0580647, + "diversity_loss_mlp": 0.0, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.0671044499872579, + "language_loss": 0.79825693, + "learning_rate": 0.0003188495400139559, + "loss": 0.80893195, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3272, + "time_per_iteration": 2.834392786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106563, + "balance_loss_mlp": 1.05608094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07440991142052084, + "language_loss": 0.84596652, + "learning_rate": 0.00031855919870411013, + "loss": 0.85662282, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3273, + "time_per_iteration": 2.8662502765655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.05781233, + "diversity_loss_mlp": 0.0, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06934000715416044, + "language_loss": 0.8508203, + "learning_rate": 0.0003182689278358305, + "loss": 0.86149418, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3274, + "time_per_iteration": 2.707679510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071338, + "balance_loss_mlp": 1.06173623, + "diversity_loss_mlp": 0.0, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.08830765837123684, + "language_loss": 0.79631943, + "learning_rate": 0.0003179787275218105, + "loss": 0.80703276, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3275, + "time_per_iteration": 2.6076841354370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806629, + "balance_loss_mlp": 1.3660543, + "diversity_loss_mlp": 0.22307114, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.030809011685951734, + "language_loss": 0.84306061, + "learning_rate": 0.0003176885978747155, + "loss": 0.85112691, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01206683, + "step": 3276, + "time_per_iteration": 2.6712234020233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070055, + "balance_loss_mlp": 1.06039953, + "diversity_loss_mlp": 0.0, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.05912857494905308, + "language_loss": 0.82393259, + "learning_rate": 0.0003173985390071839, + "loss": 0.83463317, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3277, + "time_per_iteration": 2.8781204223632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020459, + "balance_loss_mlp": 1.01545238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.014813696367821054, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78920913, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.05004883, + "routerloss_mlp": 0.0, + "step": 3278, + "time_per_iteration": 4.869734287261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.06190431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 0.07813339799532502, + "language_loss": 0.80876654, + "learning_rate": 0.00031681863406122704, + "loss": 0.8194856, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.09997559, + "routerloss_mlp": 0.0, + "step": 3279, + "time_per_iteration": 2.773547410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074089, + "balance_loss_mlp": 1.06446278, + "diversity_loss_mlp": 0.0, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07216916580711319, + "language_loss": 0.85329819, + "learning_rate": 0.00031652878820794087, + "loss": 0.86403906, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3280, + "time_per_iteration": 2.980884552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070557, + "balance_loss_mlp": 1.0605855, + "diversity_loss_mlp": 0.0, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08329353384521647, + "language_loss": 0.85882401, + "learning_rate": 0.00031623901358449627, + "loss": 0.8695296, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.09967041, + "routerloss_mlp": 0.0, + "step": 3281, + "time_per_iteration": 2.650691509246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107097, + "balance_loss_mlp": 1.06155276, + "diversity_loss_mlp": 0.0, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.06939094759952598, + "language_loss": 0.88689077, + "learning_rate": 0.0003159493103033936, + "loss": 0.89760047, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3282, + "time_per_iteration": 2.589892864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022479, + "balance_loss_mlp": 1.0175674, + "diversity_loss_mlp": 0.0, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.015595592818812096, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80941534, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3283, + "time_per_iteration": 4.845726728439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063188, + "balance_loss_mlp": 1.05360401, + "diversity_loss_mlp": 0.0, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.08266858178450832, + "language_loss": 0.82553136, + "learning_rate": 0.0003153701182180776, + "loss": 0.83616328, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3284, + "time_per_iteration": 2.783351421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065881, + "balance_loss_mlp": 1.05632019, + "diversity_loss_mlp": 0.0, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.063758085961612, + "language_loss": 0.81699741, + "learning_rate": 0.00031508062963872655, + "loss": 0.82765627, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3285, + "time_per_iteration": 2.5591769218444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064029, + "balance_loss_mlp": 1.05435503, + "diversity_loss_mlp": 0.0, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.06946286940388995, + "language_loss": 0.79716074, + "learning_rate": 0.0003147912128514423, + "loss": 0.80780101, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3286, + "time_per_iteration": 2.7374072074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792206, + "balance_loss_mlp": 1.3388809, + "diversity_loss_mlp": 0.2218435, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.030646294163886513, + "language_loss": 0.87300044, + "learning_rate": 0.0003145018679685859, + "loss": 0.8809225, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01184397, + "step": 3287, + "time_per_iteration": 2.7549750804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067783, + "balance_loss_mlp": 1.05837727, + "diversity_loss_mlp": 0.0, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.05105189166461937, + "language_loss": 0.87830782, + "learning_rate": 0.00031421259510249134, + "loss": 0.88898563, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3288, + "time_per_iteration": 2.7835381031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067714, + "balance_loss_mlp": 1.05796242, + "diversity_loss_mlp": 0.0, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.136960350782239, + "language_loss": 0.81129575, + "learning_rate": 0.00031392339436546414, + "loss": 0.82197285, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3289, + "time_per_iteration": 2.8133864402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069758, + "balance_loss_mlp": 1.05946374, + "diversity_loss_mlp": 0.0, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0683406709240254, + "language_loss": 0.8385359, + "learning_rate": 0.00031363426586978205, + "loss": 0.84923339, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3290, + "time_per_iteration": 2.7862977981567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070784, + "balance_loss_mlp": 1.06093121, + "diversity_loss_mlp": 0.0, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.06517080869241837, + "language_loss": 0.84541273, + "learning_rate": 0.0003133452097276947, + "loss": 0.85612059, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3291, + "time_per_iteration": 2.735102415084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063814, + "balance_loss_mlp": 1.05341327, + "diversity_loss_mlp": 0.0, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.06655999718782692, + "language_loss": 0.8441304, + "learning_rate": 0.0003130562260514238, + "loss": 0.85476851, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.10400391, + "routerloss_mlp": 0.0, + "step": 3292, + "time_per_iteration": 2.7411108016967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067846, + "balance_loss_mlp": 1.05757022, + "diversity_loss_mlp": 0.0, + "epoch": 0.6335128895729126, + "flos": 582349782528.0, + "grad_norm": 0.05657366074496326, + "language_loss": 0.81691957, + "learning_rate": 0.0003127673149531626, + "loss": 0.82759798, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.1027832, + "routerloss_mlp": 0.0, + "step": 3293, + "time_per_iteration": 2.766249418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.05568373, + "diversity_loss_mlp": 0.0, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.0752121645824798, + "language_loss": 0.83436191, + "learning_rate": 0.0003124784765450762, + "loss": 0.84502298, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3294, + "time_per_iteration": 2.5490550994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066062, + "balance_loss_mlp": 1.05569124, + "diversity_loss_mlp": 0.0, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.06917813795445459, + "language_loss": 0.797925, + "learning_rate": 0.0003121897109393017, + "loss": 0.80858564, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.10375977, + "routerloss_mlp": 0.0, + "step": 3295, + "time_per_iteration": 2.779365062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061765, + "balance_loss_mlp": 1.05135238, + "diversity_loss_mlp": 0.0, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06234951999103671, + "language_loss": 0.89289808, + "learning_rate": 0.0003119010182479481, + "loss": 0.9035157, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.10418701, + "routerloss_mlp": 0.0, + "step": 3296, + "time_per_iteration": 2.6138393878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05855989, + "diversity_loss_mlp": 0.0, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06350246507064496, + "language_loss": 0.82675922, + "learning_rate": 0.00031161239858309563, + "loss": 0.83745015, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.10540771, + "routerloss_mlp": 0.0, + "step": 3297, + "time_per_iteration": 2.586970329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072163, + "balance_loss_mlp": 1.06148767, + "diversity_loss_mlp": 0.0, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.0696399427467901, + "language_loss": 0.83455825, + "learning_rate": 0.0003113238520567964, + "loss": 0.84527981, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.10681152, + "routerloss_mlp": 0.0, + "step": 3298, + "time_per_iteration": 2.6586110591888428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.05495286, + "diversity_loss_mlp": 0.0, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.07177816314390054, + "language_loss": 0.81584775, + "learning_rate": 0.00031103537878107403, + "loss": 0.82650054, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.10321045, + "routerloss_mlp": 0.0, + "step": 3299, + "time_per_iteration": 2.708526372909546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106763, + "balance_loss_mlp": 1.05756879, + "diversity_loss_mlp": 0.0, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.0821312661024272, + "language_loss": 0.7999661, + "learning_rate": 0.0003107469788679238, + "loss": 0.81064236, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3300, + "time_per_iteration": 2.774571180343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070552, + "balance_loss_mlp": 1.06004977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.06269586290013059, + "language_loss": 0.86672354, + "learning_rate": 0.00031045865242931267, + "loss": 0.87742901, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.10498047, + "routerloss_mlp": 0.0, + "step": 3301, + "time_per_iteration": 2.800271987915039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_mlp": 1.06537664, + "diversity_loss_mlp": 0.0, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.060025608417058285, + "language_loss": 0.83086729, + "learning_rate": 0.00031017039957717877, + "loss": 0.84162271, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.10162354, + "routerloss_mlp": 0.0, + "step": 3302, + "time_per_iteration": 2.99652361869812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083489, + "balance_loss_mlp": 1.07342744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.0673613891994724, + "language_loss": 0.89035141, + "learning_rate": 0.0003098822204234318, + "loss": 0.90118629, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3303, + "time_per_iteration": 2.6769609451293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076148, + "balance_loss_mlp": 1.06632543, + "diversity_loss_mlp": 0.0, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.0682411238472533, + "language_loss": 0.87294948, + "learning_rate": 0.00030959411507995273, + "loss": 0.88371098, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.09814453, + "routerloss_mlp": 0.0, + "step": 3304, + "time_per_iteration": 3.25303053855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073289, + "balance_loss_mlp": 1.06334674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.09293144525754729, + "language_loss": 0.80997777, + "learning_rate": 0.00030930608365859407, + "loss": 0.82071066, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.09942627, + "routerloss_mlp": 0.0, + "step": 3305, + "time_per_iteration": 2.650047540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079087, + "balance_loss_mlp": 1.06908488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.06298630616486185, + "language_loss": 0.87762672, + "learning_rate": 0.00030901812627117943, + "loss": 0.8884176, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.10003662, + "routerloss_mlp": 0.0, + "step": 3306, + "time_per_iteration": 2.605576276779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106604, + "balance_loss_mlp": 1.05617523, + "diversity_loss_mlp": 0.0, + "epoch": 0.6362062331666025, + "flos": 466525163520.0, + "grad_norm": 0.09439685712352788, + "language_loss": 0.8446157, + "learning_rate": 0.000308730243029504, + "loss": 0.85527611, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3307, + "time_per_iteration": 2.6111857891082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070403, + "balance_loss_mlp": 1.06070554, + "diversity_loss_mlp": 0.0, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.06852736886674453, + "language_loss": 0.7914747, + "learning_rate": 0.0003084424340453339, + "loss": 0.80217868, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3308, + "time_per_iteration": 2.8072149753570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063715, + "balance_loss_mlp": 1.05379033, + "diversity_loss_mlp": 0.0, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.0739185528440478, + "language_loss": 0.82162523, + "learning_rate": 0.0003081546994304064, + "loss": 0.8322624, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.09918213, + "routerloss_mlp": 0.0, + "step": 3309, + "time_per_iteration": 2.7670769691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_mlp": 1.04971123, + "diversity_loss_mlp": 0.0, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07802596117693822, + "language_loss": 0.81907165, + "learning_rate": 0.0003078670392964298, + "loss": 0.82966554, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3310, + "time_per_iteration": 2.6474099159240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058905, + "balance_loss_mlp": 1.04899311, + "diversity_loss_mlp": 0.0, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.0731557233203608, + "language_loss": 0.82997435, + "learning_rate": 0.00030757945375508406, + "loss": 0.84056342, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.09906006, + "routerloss_mlp": 0.0, + "step": 3311, + "time_per_iteration": 2.6429851055145264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054164, + "balance_loss_mlp": 1.04434729, + "diversity_loss_mlp": 0.0, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.06845871409018763, + "language_loss": 0.81414253, + "learning_rate": 0.00030729194291801944, + "loss": 0.8246842, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3312, + "time_per_iteration": 2.6631555557250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_mlp": 1.04690671, + "diversity_loss_mlp": 0.0, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.08097298950364754, + "language_loss": 0.77058214, + "learning_rate": 0.00030700450689685787, + "loss": 0.78114825, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3313, + "time_per_iteration": 2.540600061416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059608, + "balance_loss_mlp": 1.0500232, + "diversity_loss_mlp": 0.0, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.0804877394257798, + "language_loss": 0.85728467, + "learning_rate": 0.00030671714580319186, + "loss": 0.86788076, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3314, + "time_per_iteration": 2.804875135421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055412, + "balance_loss_mlp": 1.04565513, + "diversity_loss_mlp": 0.0, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07597136338877614, + "language_loss": 0.83442312, + "learning_rate": 0.0003064298597485846, + "loss": 0.84497726, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.09747314, + "routerloss_mlp": 0.0, + "step": 3315, + "time_per_iteration": 2.860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010587, + "balance_loss_mlp": 1.04858494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.06770078099501715, + "language_loss": 0.83771706, + "learning_rate": 0.00030614264884457054, + "loss": 0.84830409, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3316, + "time_per_iteration": 2.6398963928222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_mlp": 1.04450154, + "diversity_loss_mlp": 0.0, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09575765703427323, + "language_loss": 0.77156532, + "learning_rate": 0.000305855513202655, + "loss": 0.78211164, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3317, + "time_per_iteration": 2.57024884223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_mlp": 1.04220688, + "diversity_loss_mlp": 0.0, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.07693758647747995, + "language_loss": 0.77392501, + "learning_rate": 0.0003055684529343138, + "loss": 0.7844497, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.10266113, + "routerloss_mlp": 0.0, + "step": 3318, + "time_per_iteration": 2.4296517372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058636, + "balance_loss_mlp": 1.04889059, + "diversity_loss_mlp": 0.0, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.08157026730411542, + "language_loss": 0.78901523, + "learning_rate": 0.00030528146815099374, + "loss": 0.79960155, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3319, + "time_per_iteration": 2.6178040504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105942, + "balance_loss_mlp": 1.0495379, + "diversity_loss_mlp": 0.0, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.05929975411068792, + "language_loss": 0.72059178, + "learning_rate": 0.00030499455896411203, + "loss": 0.73118603, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3320, + "time_per_iteration": 2.627962589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_mlp": 1.02049422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6388995767602924, + "flos": 1455979069440.0, + "grad_norm": 0.01967957525447477, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77326888, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.0559082, + "routerloss_mlp": 0.0, + "step": 3321, + "time_per_iteration": 4.926000595092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068116, + "balance_loss_mlp": 1.05800068, + "diversity_loss_mlp": 0.0, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.06833251339694629, + "language_loss": 0.76524007, + "learning_rate": 0.0003044209678251865, + "loss": 0.77592129, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.10107422, + "routerloss_mlp": 0.0, + "step": 3322, + "time_per_iteration": 2.916396379470825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066594, + "balance_loss_mlp": 1.05691469, + "diversity_loss_mlp": 0.0, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.05729140281605497, + "language_loss": 0.84366953, + "learning_rate": 0.0003041342860958306, + "loss": 0.85433549, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3323, + "time_per_iteration": 2.7770862579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.06162453, + "diversity_loss_mlp": 0.0, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.08519156923386062, + "language_loss": 0.91346496, + "learning_rate": 0.00030384768040828857, + "loss": 0.92417842, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3324, + "time_per_iteration": 2.6812171936035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081336, + "balance_loss_mlp": 1.07172787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.07651235317530308, + "language_loss": 0.85160887, + "learning_rate": 0.00030356115087383094, + "loss": 0.86242223, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3325, + "time_per_iteration": 2.6458263397216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00811228, + "balance_loss_mlp": 1.37989581, + "diversity_loss_mlp": 0.21910624, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.034032588306098184, + "language_loss": 0.8530367, + "learning_rate": 0.00030327469760369803, + "loss": 0.86114895, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01172681, + "step": 3326, + "time_per_iteration": 2.6054904460906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075022, + "balance_loss_mlp": 1.06528878, + "diversity_loss_mlp": 0.0, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.06651858881657381, + "language_loss": 0.84802389, + "learning_rate": 0.0003029883207091009, + "loss": 0.85877407, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3327, + "time_per_iteration": 2.7084085941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075174, + "balance_loss_mlp": 1.06530905, + "diversity_loss_mlp": 0.0, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.07064025062286232, + "language_loss": 0.78362405, + "learning_rate": 0.00030270202030122095, + "loss": 0.79437578, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3328, + "time_per_iteration": 2.668501615524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_mlp": 1.06659508, + "diversity_loss_mlp": 0.0, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.07541554155703202, + "language_loss": 0.85661519, + "learning_rate": 0.00030241579649121, + "loss": 0.867378, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3329, + "time_per_iteration": 2.9972317218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107413, + "balance_loss_mlp": 1.06488538, + "diversity_loss_mlp": 0.0, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.06439571325368963, + "language_loss": 0.7957617, + "learning_rate": 0.00030212964939018994, + "loss": 0.806503, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3330, + "time_per_iteration": 2.5598840713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075671, + "balance_loss_mlp": 1.06651545, + "diversity_loss_mlp": 0.0, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.07958558119065547, + "language_loss": 0.85401917, + "learning_rate": 0.0003018435791092527, + "loss": 0.8647759, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3331, + "time_per_iteration": 2.4886720180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077198, + "balance_loss_mlp": 1.06757176, + "diversity_loss_mlp": 0.0, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.08502928683846613, + "language_loss": 0.80926251, + "learning_rate": 0.00030155758575946083, + "loss": 0.8200345, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3332, + "time_per_iteration": 2.661039113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073982, + "balance_loss_mlp": 1.06464815, + "diversity_loss_mlp": 0.0, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.07641451366860309, + "language_loss": 0.84045428, + "learning_rate": 0.0003012716694518467, + "loss": 0.85119408, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3333, + "time_per_iteration": 2.579451322555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.06456161, + "diversity_loss_mlp": 0.0, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.06148329614598223, + "language_loss": 0.85011578, + "learning_rate": 0.000300985830297413, + "loss": 0.86085725, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3334, + "time_per_iteration": 2.6951658725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070237, + "balance_loss_mlp": 1.0607183, + "diversity_loss_mlp": 0.0, + "epoch": 0.6415929203539823, + "flos": 1041317379072.0, + "grad_norm": 0.07715385519242493, + "language_loss": 0.8765533, + "learning_rate": 0.00030070006840713205, + "loss": 0.88725567, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3335, + "time_per_iteration": 3.415095329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_mlp": 1.05956614, + "diversity_loss_mlp": 0.0, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.06540243812784874, + "language_loss": 0.73462147, + "learning_rate": 0.000300414383891947, + "loss": 0.74531144, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3336, + "time_per_iteration": 2.8207781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_mlp": 1.06142569, + "diversity_loss_mlp": 0.0, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.062126831222401244, + "language_loss": 0.88856506, + "learning_rate": 0.00030012877686276973, + "loss": 0.89927197, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3337, + "time_per_iteration": 2.701467752456665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.06103206, + "diversity_loss_mlp": 0.0, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.06622404014204096, + "language_loss": 0.86998606, + "learning_rate": 0.0002998432474304832, + "loss": 0.88069206, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3338, + "time_per_iteration": 2.754462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023208, + "balance_loss_mlp": 1.01724732, + "diversity_loss_mlp": 0.0, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.025409804512754288, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80260551, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3339, + "time_per_iteration": 4.871408700942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061344, + "balance_loss_mlp": 1.05190849, + "diversity_loss_mlp": 0.0, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.056182904751461135, + "language_loss": 0.88884711, + "learning_rate": 0.00029927242179996107, + "loss": 0.89946061, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3340, + "time_per_iteration": 2.6943204402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063875, + "balance_loss_mlp": 1.05451107, + "diversity_loss_mlp": 0.0, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.05740093819519034, + "language_loss": 0.83547878, + "learning_rate": 0.0002989871258233398, + "loss": 0.8461175, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3341, + "time_per_iteration": 2.759075164794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106288, + "balance_loss_mlp": 1.05317652, + "diversity_loss_mlp": 0.0, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.08495529058707293, + "language_loss": 0.82866132, + "learning_rate": 0.0002987019078868373, + "loss": 0.83929014, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3342, + "time_per_iteration": 2.460184097290039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00806137, + "balance_loss_mlp": 1.3687458, + "diversity_loss_mlp": 0.21894245, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.03059825895364693, + "language_loss": 0.81932986, + "learning_rate": 0.00029841676810118484, + "loss": 0.82739115, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01229309, + "step": 3343, + "time_per_iteration": 2.6885409355163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_mlp": 1.04915428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.0604476685897385, + "language_loss": 0.87177467, + "learning_rate": 0.0002981317065770839, + "loss": 0.88236231, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3344, + "time_per_iteration": 3.03983736038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060176, + "balance_loss_mlp": 1.05044222, + "diversity_loss_mlp": 0.0, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.07704872008291591, + "language_loss": 0.8078779, + "learning_rate": 0.00029784672342520493, + "loss": 0.81847966, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3345, + "time_per_iteration": 2.6846296787261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061675, + "balance_loss_mlp": 1.05220366, + "diversity_loss_mlp": 0.0, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.06975007259690363, + "language_loss": 0.8341136, + "learning_rate": 0.00029756181875618834, + "loss": 0.84473026, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3346, + "time_per_iteration": 2.5665693283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00808422, + "balance_loss_mlp": 1.37269104, + "diversity_loss_mlp": 0.21939373, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.035494504018204545, + "language_loss": 0.83294541, + "learning_rate": 0.0002972769926806439, + "loss": 0.84102958, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123796, + "step": 3347, + "time_per_iteration": 2.504934549331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0080263, + "balance_loss_mlp": 1.36098909, + "diversity_loss_mlp": 0.21952364, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.0334865497392214, + "language_loss": 0.88848293, + "learning_rate": 0.0002969922453091508, + "loss": 0.89650929, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01237371, + "step": 3348, + "time_per_iteration": 2.588092803955078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105701, + "balance_loss_mlp": 1.04741955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6442862639476722, + "flos": 540469241856.0, + "grad_norm": 0.07081599083542611, + "language_loss": 0.85229504, + "learning_rate": 0.00029670757675225777, + "loss": 0.86286509, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3349, + "time_per_iteration": 2.7467896938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_mlp": 1.04726744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.08621507866757971, + "language_loss": 0.79660463, + "learning_rate": 0.0002964229871204831, + "loss": 0.80717242, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3350, + "time_per_iteration": 2.65602707862854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056473, + "balance_loss_mlp": 1.04715693, + "diversity_loss_mlp": 0.0, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.0705050991392221, + "language_loss": 0.83769023, + "learning_rate": 0.00029613847652431403, + "loss": 0.84825498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3351, + "time_per_iteration": 2.8451104164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797485, + "balance_loss_mlp": 1.35163832, + "diversity_loss_mlp": 0.21852379, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.02943697991412704, + "language_loss": 0.79510611, + "learning_rate": 0.0002958540450742078, + "loss": 0.80308104, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01240353, + "step": 3352, + "time_per_iteration": 2.950679063796997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060228, + "balance_loss_mlp": 1.05063784, + "diversity_loss_mlp": 0.0, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.06852868488451136, + "language_loss": 0.7732749, + "learning_rate": 0.0002955696928805901, + "loss": 0.78387713, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3353, + "time_per_iteration": 2.8771724700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067774, + "balance_loss_mlp": 1.0582372, + "diversity_loss_mlp": 0.0, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.10704512558750189, + "language_loss": 0.86111909, + "learning_rate": 0.0002952854200538563, + "loss": 0.87179685, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3354, + "time_per_iteration": 2.777782917022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798015, + "balance_loss_mlp": 1.35377836, + "diversity_loss_mlp": 0.21820019, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.032699702246912744, + "language_loss": 0.82167614, + "learning_rate": 0.000295001226704371, + "loss": 0.82965624, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01202584, + "step": 3355, + "time_per_iteration": 2.5991604328155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_mlp": 1.05207551, + "diversity_loss_mlp": 0.0, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.07645377110954561, + "language_loss": 0.82891458, + "learning_rate": 0.00029471711294246783, + "loss": 0.8395294, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3356, + "time_per_iteration": 2.8146939277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.05512571, + "diversity_loss_mlp": 0.0, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.07650305014050414, + "language_loss": 0.82254899, + "learning_rate": 0.0002944330788784494, + "loss": 0.83319402, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3357, + "time_per_iteration": 2.90537428855896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_mlp": 1.05508041, + "diversity_loss_mlp": 0.0, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.06168723315149378, + "language_loss": 0.84662282, + "learning_rate": 0.00029414912462258786, + "loss": 0.85727078, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3358, + "time_per_iteration": 2.8301830291748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068597, + "balance_loss_mlp": 1.05873299, + "diversity_loss_mlp": 0.0, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.07109215771884392, + "language_loss": 0.81651056, + "learning_rate": 0.00029386525028512366, + "loss": 0.8271966, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3359, + "time_per_iteration": 2.689298152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068602, + "balance_loss_mlp": 1.05881464, + "diversity_loss_mlp": 0.0, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.0690455154627963, + "language_loss": 0.86761546, + "learning_rate": 0.0002935814559762666, + "loss": 0.8783015, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3360, + "time_per_iteration": 2.820415496826172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072441, + "balance_loss_mlp": 1.06286263, + "diversity_loss_mlp": 0.0, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.06340694058104589, + "language_loss": 0.7940557, + "learning_rate": 0.0002932977418061957, + "loss": 0.80478007, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3361, + "time_per_iteration": 2.638246536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075754, + "balance_loss_mlp": 1.06592488, + "diversity_loss_mlp": 0.0, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.11078731162526398, + "language_loss": 0.80980253, + "learning_rate": 0.00029301410788505833, + "loss": 0.82056004, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3362, + "time_per_iteration": 2.829946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067795, + "balance_loss_mlp": 1.05792451, + "diversity_loss_mlp": 0.0, + "epoch": 0.6469796075413621, + "flos": 432101620224.0, + "grad_norm": 0.08350394703111745, + "language_loss": 0.80845594, + "learning_rate": 0.00029273055432297126, + "loss": 0.81913394, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.09875488, + "routerloss_mlp": 0.0, + "step": 3363, + "time_per_iteration": 2.5047130584716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057084, + "balance_loss_mlp": 1.04717803, + "diversity_loss_mlp": 0.0, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06756647759690963, + "language_loss": 0.80998582, + "learning_rate": 0.00029244708123001917, + "loss": 0.8205567, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3364, + "time_per_iteration": 3.071207284927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059298, + "balance_loss_mlp": 1.04951715, + "diversity_loss_mlp": 0.0, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.08982319043529345, + "language_loss": 0.84555328, + "learning_rate": 0.0002921636887162565, + "loss": 0.85614622, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3365, + "time_per_iteration": 2.768284797668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057421, + "balance_loss_mlp": 1.04800391, + "diversity_loss_mlp": 0.0, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.08629567448100454, + "language_loss": 0.83712798, + "learning_rate": 0.00029188037689170595, + "loss": 0.84770226, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3366, + "time_per_iteration": 2.9462075233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04440713, + "diversity_loss_mlp": 0.0, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.07194825267456643, + "language_loss": 0.84329098, + "learning_rate": 0.0002915971458663586, + "loss": 0.85383338, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.09820557, + "routerloss_mlp": 0.0, + "step": 3367, + "time_per_iteration": 3.052452802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105506, + "balance_loss_mlp": 1.04521894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.06187590041276245, + "language_loss": 0.81901962, + "learning_rate": 0.00029131399575017494, + "loss": 0.82957023, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.09838867, + "routerloss_mlp": 0.0, + "step": 3368, + "time_per_iteration": 3.260995864868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054713, + "balance_loss_mlp": 1.04508734, + "diversity_loss_mlp": 0.0, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.08987768190651603, + "language_loss": 0.85898274, + "learning_rate": 0.0002910309266530836, + "loss": 0.8695299, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3369, + "time_per_iteration": 2.8022115230560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059559, + "balance_loss_mlp": 1.0497539, + "diversity_loss_mlp": 0.0, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.07644364345836648, + "language_loss": 0.8560974, + "learning_rate": 0.0002907479386849814, + "loss": 0.86669296, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.09796143, + "routerloss_mlp": 0.0, + "step": 3370, + "time_per_iteration": 2.646334171295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057441, + "balance_loss_mlp": 1.04791021, + "diversity_loss_mlp": 0.0, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.07833648604751785, + "language_loss": 0.80597669, + "learning_rate": 0.0002904650319557339, + "loss": 0.81655109, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3371, + "time_per_iteration": 2.9977073669433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787303, + "balance_loss_mlp": 1.33170056, + "diversity_loss_mlp": 0.21746175, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.036264020076934224, + "language_loss": 0.81342006, + "learning_rate": 0.0002901822065751758, + "loss": 0.82129312, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01272238, + "step": 3372, + "time_per_iteration": 2.697375774383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054945, + "balance_loss_mlp": 1.04537833, + "diversity_loss_mlp": 0.0, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.06787352107623057, + "language_loss": 0.8556366, + "learning_rate": 0.0002898994626531093, + "loss": 0.86618596, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3373, + "time_per_iteration": 2.8561713695526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059452, + "balance_loss_mlp": 1.05008769, + "diversity_loss_mlp": 0.0, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.07079984620053167, + "language_loss": 0.87879932, + "learning_rate": 0.00028961680029930526, + "loss": 0.88939387, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3374, + "time_per_iteration": 2.535357713699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_mlp": 1.04902411, + "diversity_loss_mlp": 0.0, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.07847742657670442, + "language_loss": 0.7705428, + "learning_rate": 0.00028933421962350317, + "loss": 0.78112632, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3375, + "time_per_iteration": 2.7630350589752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059653, + "balance_loss_mlp": 1.05022955, + "diversity_loss_mlp": 0.0, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.060066877370730534, + "language_loss": 0.83867884, + "learning_rate": 0.0002890517207354104, + "loss": 0.84927535, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3376, + "time_per_iteration": 2.8403854370117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067954, + "balance_loss_mlp": 1.05819058, + "diversity_loss_mlp": 0.0, + "epoch": 0.649672951135052, + "flos": 531806736384.0, + "grad_norm": 0.07875615832785021, + "language_loss": 0.81685328, + "learning_rate": 0.0002887693037447029, + "loss": 0.82753289, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3377, + "time_per_iteration": 2.5936834812164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786778, + "balance_loss_mlp": 1.32879448, + "diversity_loss_mlp": 0.22056285, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.03360133181749734, + "language_loss": 0.82620949, + "learning_rate": 0.00028848696876102443, + "loss": 0.8340773, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01209909, + "step": 3378, + "time_per_iteration": 2.646881341934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083646, + "balance_loss_mlp": 1.07432425, + "diversity_loss_mlp": 0.0, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.07289026043627096, + "language_loss": 0.83464664, + "learning_rate": 0.00028820471589398723, + "loss": 0.84548312, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 3379, + "time_per_iteration": 2.5300872325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079061, + "balance_loss_mlp": 1.3374207, + "diversity_loss_mlp": 0.22020277, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.03832598047329158, + "language_loss": 0.78047603, + "learning_rate": 0.00028792254525317196, + "loss": 0.78838205, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01179803, + "step": 3380, + "time_per_iteration": 2.696711301803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090042, + "balance_loss_mlp": 1.08066666, + "diversity_loss_mlp": 0.0, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.07654044550208572, + "language_loss": 0.81385279, + "learning_rate": 0.00028764045694812645, + "loss": 0.82475317, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3381, + "time_per_iteration": 2.7730586528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092437, + "balance_loss_mlp": 1.08303761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.08987457099582341, + "language_loss": 0.76744068, + "learning_rate": 0.0002873584510883671, + "loss": 0.77836508, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3382, + "time_per_iteration": 2.6443450450897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088701, + "balance_loss_mlp": 1.07926512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.07067062397279458, + "language_loss": 0.86143303, + "learning_rate": 0.0002870765277833788, + "loss": 0.87232006, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3383, + "time_per_iteration": 2.740920305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.07161593, + "diversity_loss_mlp": 0.0, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.07689735458190097, + "language_loss": 0.80460048, + "learning_rate": 0.00028679468714261347, + "loss": 0.81540942, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3384, + "time_per_iteration": 2.7767040729522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074737, + "balance_loss_mlp": 1.06546891, + "diversity_loss_mlp": 0.0, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.06416640561224615, + "language_loss": 0.76925558, + "learning_rate": 0.0002865129292754918, + "loss": 0.78000295, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3385, + "time_per_iteration": 2.591616630554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075196, + "balance_loss_mlp": 1.06574309, + "diversity_loss_mlp": 0.0, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.06819374320087251, + "language_loss": 0.81950033, + "learning_rate": 0.00028623125429140105, + "loss": 0.83025235, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3386, + "time_per_iteration": 2.819565773010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.05845094, + "diversity_loss_mlp": 0.0, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.07152430707450508, + "language_loss": 0.8685019, + "learning_rate": 0.00028594966229969785, + "loss": 0.87918359, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3387, + "time_per_iteration": 2.6802561283111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_mlp": 1.05746567, + "diversity_loss_mlp": 0.0, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.0719578704836234, + "language_loss": 0.81695348, + "learning_rate": 0.00028566815340970577, + "loss": 0.82762671, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3388, + "time_per_iteration": 2.725184917449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078869, + "balance_loss_mlp": 1.33117235, + "diversity_loss_mlp": 0.22285563, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.03133119374313574, + "language_loss": 0.80959165, + "learning_rate": 0.0002853867277307162, + "loss": 0.81747854, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01167633, + "step": 3389, + "time_per_iteration": 2.6700825691223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066001, + "balance_loss_mlp": 1.05601168, + "diversity_loss_mlp": 0.0, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.077177119922592, + "language_loss": 0.82811326, + "learning_rate": 0.00028510538537198824, + "loss": 0.83877325, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.09985352, + "routerloss_mlp": 0.0, + "step": 3390, + "time_per_iteration": 2.65598464012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065546, + "balance_loss_mlp": 1.05591428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6523662947287419, + "flos": 665707797504.0, + "grad_norm": 0.06292665593790116, + "language_loss": 0.86663938, + "learning_rate": 0.00028482412644274867, + "loss": 0.87729478, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3391, + "time_per_iteration": 2.926029682159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106361, + "balance_loss_mlp": 1.05354261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.07441000419261597, + "language_loss": 0.74793214, + "learning_rate": 0.00028454295105219207, + "loss": 0.75856817, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.10064697, + "routerloss_mlp": 0.0, + "step": 3392, + "time_per_iteration": 2.6511483192443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064701, + "balance_loss_mlp": 1.05479479, + "diversity_loss_mlp": 0.0, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.053639196798002685, + "language_loss": 0.79547405, + "learning_rate": 0.0002842618593094802, + "loss": 0.80612105, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3393, + "time_per_iteration": 3.1180903911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066549, + "balance_loss_mlp": 1.05651164, + "diversity_loss_mlp": 0.0, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.09762000223606793, + "language_loss": 0.80486917, + "learning_rate": 0.00028398085132374243, + "loss": 0.81553459, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.1003418, + "routerloss_mlp": 0.0, + "step": 3394, + "time_per_iteration": 2.805560350418091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061804, + "balance_loss_mlp": 1.05185044, + "diversity_loss_mlp": 0.0, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.06212778963151281, + "language_loss": 0.84015262, + "learning_rate": 0.0002836999272040761, + "loss": 0.85077065, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3395, + "time_per_iteration": 3.1151998043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_mlp": 1.05245829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.07524661860640132, + "language_loss": 0.83834863, + "learning_rate": 0.00028341908705954575, + "loss": 0.84897625, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.10296631, + "routerloss_mlp": 0.0, + "step": 3396, + "time_per_iteration": 2.5500996112823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00599946, + "balance_loss_mlp": 1.02570343, + "diversity_loss_mlp": 0.15256089, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.0014313680900061394, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82361758, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01081435, + "step": 3397, + "time_per_iteration": 4.838392496109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060865, + "balance_loss_mlp": 1.05047619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.08700190278237876, + "language_loss": 0.77911532, + "learning_rate": 0.00028285765913198604, + "loss": 0.78972399, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.10394287, + "routerloss_mlp": 0.0, + "step": 3398, + "time_per_iteration": 2.5510177612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_mlp": 1.04590559, + "diversity_loss_mlp": 0.0, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.06794032810044964, + "language_loss": 0.82229477, + "learning_rate": 0.0002825770715669227, + "loss": 0.83285522, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.10137939, + "routerloss_mlp": 0.0, + "step": 3399, + "time_per_iteration": 2.7065982818603516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052667, + "balance_loss_mlp": 1.04248071, + "diversity_loss_mlp": 0.0, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06703848890261048, + "language_loss": 0.81440985, + "learning_rate": 0.00028229656841292634, + "loss": 0.82493651, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.10186768, + "routerloss_mlp": 0.0, + "step": 3400, + "time_per_iteration": 2.7117483615875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050894, + "balance_loss_mlp": 1.04067171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.06998039744710104, + "language_loss": 0.76892245, + "learning_rate": 0.0002820161497788979, + "loss": 0.7794314, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.10217285, + "routerloss_mlp": 0.0, + "step": 3401, + "time_per_iteration": 2.590047836303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049365, + "balance_loss_mlp": 1.03930926, + "diversity_loss_mlp": 0.0, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.06845614791056948, + "language_loss": 0.86992002, + "learning_rate": 0.00028173581577370545, + "loss": 0.88041365, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.1005249, + "routerloss_mlp": 0.0, + "step": 3402, + "time_per_iteration": 2.7577242851257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047368, + "balance_loss_mlp": 1.03716338, + "diversity_loss_mlp": 0.0, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.059228402052172, + "language_loss": 0.78973734, + "learning_rate": 0.0002814555665061844, + "loss": 0.80021101, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.10198975, + "routerloss_mlp": 0.0, + "step": 3403, + "time_per_iteration": 2.731137752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_mlp": 1.0375247, + "diversity_loss_mlp": 0.0, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.07926071177251158, + "language_loss": 0.77611935, + "learning_rate": 0.00028117540208513715, + "loss": 0.78659368, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3404, + "time_per_iteration": 2.689107894897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077145, + "balance_loss_mlp": 1.2970531, + "diversity_loss_mlp": 0.22200939, + "epoch": 0.6550596383224317, + "flos": 616012356096.0, + "grad_norm": 0.029568297533915613, + "language_loss": 0.85005927, + "learning_rate": 0.00028089532261933313, + "loss": 0.85777372, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01191924, + "step": 3405, + "time_per_iteration": 2.7177927494049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105146, + "balance_loss_mlp": 1.04141116, + "diversity_loss_mlp": 0.0, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.08876519929545809, + "language_loss": 0.85989165, + "learning_rate": 0.0002806153282175087, + "loss": 0.87040627, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3406, + "time_per_iteration": 2.5502045154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04348814, + "diversity_loss_mlp": 0.0, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.07350490516448754, + "language_loss": 0.82776654, + "learning_rate": 0.0002803354189883679, + "loss": 0.83829957, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3407, + "time_per_iteration": 2.8476340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_mlp": 1.0448494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.06617021222220203, + "language_loss": 0.85199594, + "learning_rate": 0.00028005559504058053, + "loss": 0.86254454, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3408, + "time_per_iteration": 2.701035261154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_mlp": 1.04206932, + "diversity_loss_mlp": 0.0, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.08388731304351217, + "language_loss": 0.77208018, + "learning_rate": 0.0002797758564827838, + "loss": 0.78260207, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.10113525, + "routerloss_mlp": 0.0, + "step": 3409, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058496, + "balance_loss_mlp": 1.04903674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.07006819638769121, + "language_loss": 0.83542061, + "learning_rate": 0.0002794962034235824, + "loss": 0.84600556, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3410, + "time_per_iteration": 2.634612798690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054024, + "balance_loss_mlp": 1.04401076, + "diversity_loss_mlp": 0.0, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.07454971523093613, + "language_loss": 0.74929279, + "learning_rate": 0.00027921663597154695, + "loss": 0.75983304, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.10009766, + "routerloss_mlp": 0.0, + "step": 3411, + "time_per_iteration": 2.736161708831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058952, + "balance_loss_mlp": 1.04926038, + "diversity_loss_mlp": 0.0, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.08159088858174726, + "language_loss": 0.81125355, + "learning_rate": 0.00027893715423521525, + "loss": 0.82184303, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3412, + "time_per_iteration": 2.452563524246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781164, + "balance_loss_mlp": 1.31892097, + "diversity_loss_mlp": 0.22038518, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.03347946196666781, + "language_loss": 0.8419345, + "learning_rate": 0.00027865775832309163, + "loss": 0.84974611, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151081, + "step": 3413, + "time_per_iteration": 2.6782755851745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068715, + "balance_loss_mlp": 1.05899358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.0675198993979362, + "language_loss": 0.86263126, + "learning_rate": 0.00027837844834364733, + "loss": 0.87331843, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3414, + "time_per_iteration": 2.63967227935791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_mlp": 1.04836726, + "diversity_loss_mlp": 0.0, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.06663266607359189, + "language_loss": 0.8659035, + "learning_rate": 0.00027809922440532, + "loss": 0.87648469, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3415, + "time_per_iteration": 2.816204786300659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059729, + "balance_loss_mlp": 1.05018628, + "diversity_loss_mlp": 0.0, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.06360594790571725, + "language_loss": 0.81154943, + "learning_rate": 0.00027782008661651406, + "loss": 0.82214665, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3416, + "time_per_iteration": 2.80657958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059234, + "balance_loss_mlp": 1.04937577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.062003807204006764, + "language_loss": 0.87255514, + "learning_rate": 0.00027754103508560013, + "loss": 0.88314748, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.09857178, + "routerloss_mlp": 0.0, + "step": 3417, + "time_per_iteration": 2.648777723312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062014, + "balance_loss_mlp": 1.05205965, + "diversity_loss_mlp": 0.0, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.06781110485333444, + "language_loss": 0.82382166, + "learning_rate": 0.0002772620699209163, + "loss": 0.83444178, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.0994873, + "routerloss_mlp": 0.0, + "step": 3418, + "time_per_iteration": 2.566547155380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_mlp": 1.0503962, + "diversity_loss_mlp": 0.0, + "epoch": 0.6577529819161216, + "flos": 481940596224.0, + "grad_norm": 0.0650517875970755, + "language_loss": 0.79616904, + "learning_rate": 0.0002769831912307658, + "loss": 0.80676609, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3419, + "time_per_iteration": 2.606062889099121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061383, + "balance_loss_mlp": 1.05156565, + "diversity_loss_mlp": 0.0, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.07306581186555239, + "language_loss": 0.80279779, + "learning_rate": 0.00027670439912341917, + "loss": 0.81341165, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3420, + "time_per_iteration": 2.616004228591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058734, + "balance_loss_mlp": 1.04903078, + "diversity_loss_mlp": 0.0, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.07531365664549339, + "language_loss": 0.83319843, + "learning_rate": 0.0002764256937071129, + "loss": 0.84378576, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.09692383, + "routerloss_mlp": 0.0, + "step": 3421, + "time_per_iteration": 2.7864840030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061647, + "balance_loss_mlp": 1.05205703, + "diversity_loss_mlp": 0.0, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.06844647739450752, + "language_loss": 0.87222612, + "learning_rate": 0.00027614707509005036, + "loss": 0.88284254, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 3422, + "time_per_iteration": 2.666473388671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058786, + "balance_loss_mlp": 1.04912376, + "diversity_loss_mlp": 0.0, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.0762783210263198, + "language_loss": 0.79373097, + "learning_rate": 0.0002758685433804008, + "loss": 0.8043189, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3423, + "time_per_iteration": 2.4872303009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056028, + "balance_loss_mlp": 1.04637778, + "diversity_loss_mlp": 0.0, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.07259832833327884, + "language_loss": 0.79187661, + "learning_rate": 0.00027559009868630005, + "loss": 0.80243689, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3424, + "time_per_iteration": 3.1284892559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063823, + "balance_loss_mlp": 1.0545187, + "diversity_loss_mlp": 0.0, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.07475259244153008, + "language_loss": 0.80332637, + "learning_rate": 0.0002753117411158491, + "loss": 0.81396455, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3425, + "time_per_iteration": 3.024216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066892, + "balance_loss_mlp": 1.05724216, + "diversity_loss_mlp": 0.0, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.06493586108743211, + "language_loss": 0.89989424, + "learning_rate": 0.0002750334707771168, + "loss": 0.91056317, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.09637451, + "routerloss_mlp": 0.0, + "step": 3426, + "time_per_iteration": 2.6436870098114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066532, + "balance_loss_mlp": 1.0567987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.06891806065084582, + "language_loss": 0.81568319, + "learning_rate": 0.0002747552877781369, + "loss": 0.82634848, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.097229, + "routerloss_mlp": 0.0, + "step": 3427, + "time_per_iteration": 2.484457015991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106612, + "balance_loss_mlp": 1.05665517, + "diversity_loss_mlp": 0.0, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.06651025164376474, + "language_loss": 0.81769067, + "learning_rate": 0.0002744771922269097, + "loss": 0.82835186, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3428, + "time_per_iteration": 2.724034547805786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06395817, + "diversity_loss_mlp": 0.0, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.08249136451092651, + "language_loss": 0.81983304, + "learning_rate": 0.0002741991842314015, + "loss": 0.83056509, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3429, + "time_per_iteration": 3.4791431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106958, + "balance_loss_mlp": 1.06021035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.09631718735244636, + "language_loss": 0.85994452, + "learning_rate": 0.0002739212638995445, + "loss": 0.87064034, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3430, + "time_per_iteration": 2.5809226036071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070566, + "balance_loss_mlp": 1.06089258, + "diversity_loss_mlp": 0.0, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.07152811859744175, + "language_loss": 0.83226836, + "learning_rate": 0.00027364343133923696, + "loss": 0.84297395, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.09667969, + "routerloss_mlp": 0.0, + "step": 3431, + "time_per_iteration": 2.664724826812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072179, + "balance_loss_mlp": 1.06281483, + "diversity_loss_mlp": 0.0, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.07076815482363777, + "language_loss": 0.82710063, + "learning_rate": 0.0002733656866583431, + "loss": 0.83782238, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3432, + "time_per_iteration": 2.6845815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075681, + "balance_loss_mlp": 1.06614459, + "diversity_loss_mlp": 0.0, + "epoch": 0.6604463255098114, + "flos": 857159594496.0, + "grad_norm": 0.07348653509543634, + "language_loss": 0.83014315, + "learning_rate": 0.0002730880299646927, + "loss": 0.84089994, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3433, + "time_per_iteration": 3.09417462348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072804, + "balance_loss_mlp": 1.06348789, + "diversity_loss_mlp": 0.0, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.060523936244010056, + "language_loss": 0.85307741, + "learning_rate": 0.0002728104613660821, + "loss": 0.86380541, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3434, + "time_per_iteration": 2.844012498855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.06231332, + "diversity_loss_mlp": 0.0, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.06580511923703304, + "language_loss": 0.83062303, + "learning_rate": 0.0002725329809702729, + "loss": 0.84133923, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3435, + "time_per_iteration": 3.203927516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.06119871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.07937285786961487, + "language_loss": 0.76092625, + "learning_rate": 0.0002722555888849921, + "loss": 0.77163053, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3436, + "time_per_iteration": 3.441042423248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.06265306, + "diversity_loss_mlp": 0.0, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.06477982340890849, + "language_loss": 0.80420995, + "learning_rate": 0.00027197828521793334, + "loss": 0.81492901, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3437, + "time_per_iteration": 2.508976697921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.0631609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.05773126923802199, + "language_loss": 0.85235512, + "learning_rate": 0.0002717010700767552, + "loss": 0.86308175, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3438, + "time_per_iteration": 2.7343809604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788388, + "balance_loss_mlp": 1.33122396, + "diversity_loss_mlp": 0.22170436, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.035967269047030424, + "language_loss": 0.76073134, + "learning_rate": 0.00027142394356908226, + "loss": 0.76861525, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01192367, + "step": 3439, + "time_per_iteration": 2.6098694801330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072149, + "balance_loss_mlp": 1.06304741, + "diversity_loss_mlp": 0.0, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.07092995700037574, + "language_loss": 0.84935868, + "learning_rate": 0.00027114690580250456, + "loss": 0.86008012, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3440, + "time_per_iteration": 2.7477781772613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_mlp": 1.05864227, + "diversity_loss_mlp": 0.0, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.07606845250334485, + "language_loss": 0.87084186, + "learning_rate": 0.0002708699568845776, + "loss": 0.88152039, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3441, + "time_per_iteration": 2.6247143745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068327, + "balance_loss_mlp": 1.062343, + "diversity_loss_mlp": 0.0, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.03817420207517821, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80356109, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.05981445, + "routerloss_mlp": 0.0, + "step": 3442, + "time_per_iteration": 4.9118194580078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070581, + "balance_loss_mlp": 1.06144977, + "diversity_loss_mlp": 0.0, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.059711141008881904, + "language_loss": 0.83110899, + "learning_rate": 0.0002703163260247261, + "loss": 0.84181482, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3443, + "time_per_iteration": 2.6146388053894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070764, + "balance_loss_mlp": 1.06162047, + "diversity_loss_mlp": 0.0, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.07293118954211444, + "language_loss": 0.81726909, + "learning_rate": 0.0002700396442977399, + "loss": 0.82797676, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3444, + "time_per_iteration": 2.6122488975524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072126, + "balance_loss_mlp": 1.06287587, + "diversity_loss_mlp": 0.0, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.06235524151571192, + "language_loss": 0.84365332, + "learning_rate": 0.0002697630518492817, + "loss": 0.85437459, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3445, + "time_per_iteration": 2.695577621459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074381, + "balance_loss_mlp": 1.06496358, + "diversity_loss_mlp": 0.0, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.09449311389962292, + "language_loss": 0.85555631, + "learning_rate": 0.0002694865487867343, + "loss": 0.86630011, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3446, + "time_per_iteration": 2.643448829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066192, + "balance_loss_mlp": 1.0568881, + "diversity_loss_mlp": 0.0, + "epoch": 0.6631396691035013, + "flos": 613200471552.0, + "grad_norm": 0.06130478535455018, + "language_loss": 0.84665477, + "learning_rate": 0.0002692101352174453, + "loss": 0.85731673, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3447, + "time_per_iteration": 2.7684693336486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071985, + "balance_loss_mlp": 1.06239462, + "diversity_loss_mlp": 0.0, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.0686574359328325, + "language_loss": 0.84783942, + "learning_rate": 0.00026893381124872787, + "loss": 0.85855925, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3448, + "time_per_iteration": 2.6856155395507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.06869519, + "diversity_loss_mlp": 0.0, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.07711664740076789, + "language_loss": 0.80761468, + "learning_rate": 0.00026865757698786097, + "loss": 0.8183924, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3449, + "time_per_iteration": 3.0219905376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064699, + "balance_loss_mlp": 1.05549026, + "diversity_loss_mlp": 0.0, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.07081100750222453, + "language_loss": 0.81853712, + "learning_rate": 0.000268381432542088, + "loss": 0.82918411, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3450, + "time_per_iteration": 2.7959303855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063443, + "balance_loss_mlp": 1.05394757, + "diversity_loss_mlp": 0.0, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.0764006206271421, + "language_loss": 0.80043346, + "learning_rate": 0.00026810537801861807, + "loss": 0.81106788, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3451, + "time_per_iteration": 2.7303504943847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058217, + "balance_loss_mlp": 1.04875171, + "diversity_loss_mlp": 0.0, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.05834244489040309, + "language_loss": 0.81090832, + "learning_rate": 0.0002678294135246243, + "loss": 0.82149041, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3452, + "time_per_iteration": 2.733463764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056165, + "balance_loss_mlp": 1.04691422, + "diversity_loss_mlp": 0.0, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.07343702884431198, + "language_loss": 0.86356318, + "learning_rate": 0.0002675535391672463, + "loss": 0.87412483, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3453, + "time_per_iteration": 3.115978956222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00797636, + "balance_loss_mlp": 1.35083306, + "diversity_loss_mlp": 0.22054271, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.028810841374919304, + "language_loss": 0.86237454, + "learning_rate": 0.0002672777550535877, + "loss": 0.87035096, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01194801, + "step": 3454, + "time_per_iteration": 2.793548822402954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060819, + "balance_loss_mlp": 1.05172312, + "diversity_loss_mlp": 0.0, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.0753840272591569, + "language_loss": 0.85331321, + "learning_rate": 0.00026700206129071747, + "loss": 0.8639214, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3455, + "time_per_iteration": 2.5915210247039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_mlp": 1.05565548, + "diversity_loss_mlp": 0.0, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.07433202645873906, + "language_loss": 0.89061069, + "learning_rate": 0.00026672645798566925, + "loss": 0.90125895, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3456, + "time_per_iteration": 2.5754494667053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059485, + "balance_loss_mlp": 1.05019283, + "diversity_loss_mlp": 0.0, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.07294926148794169, + "language_loss": 0.79539233, + "learning_rate": 0.00026645094524544225, + "loss": 0.80598718, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.09289551, + "routerloss_mlp": 0.0, + "step": 3457, + "time_per_iteration": 3.2948148250579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056831, + "balance_loss_mlp": 1.04734802, + "diversity_loss_mlp": 0.0, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.08386362480566827, + "language_loss": 0.75221157, + "learning_rate": 0.00026617552317699945, + "loss": 0.76277989, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3458, + "time_per_iteration": 2.789961576461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057714, + "balance_loss_mlp": 1.04836822, + "diversity_loss_mlp": 0.0, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.09354786354914506, + "language_loss": 0.87007248, + "learning_rate": 0.0002659001918872693, + "loss": 0.88064957, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3459, + "time_per_iteration": 2.6320250034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058346, + "balance_loss_mlp": 1.04896998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.06598239053228593, + "language_loss": 0.80718446, + "learning_rate": 0.0002656249514831449, + "loss": 0.81776798, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3460, + "time_per_iteration": 2.6485753059387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063647, + "balance_loss_mlp": 1.05442595, + "diversity_loss_mlp": 0.0, + "epoch": 0.6658330126971912, + "flos": 1024298141184.0, + "grad_norm": 0.05863451757746151, + "language_loss": 0.87114978, + "learning_rate": 0.00026534980207148416, + "loss": 0.88178623, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3461, + "time_per_iteration": 3.4618935585021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05719471, + "diversity_loss_mlp": 0.0, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.07572861338992695, + "language_loss": 0.73451698, + "learning_rate": 0.0002650747437591097, + "loss": 0.7451815, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3462, + "time_per_iteration": 2.985516309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_mlp": 1.02065372, + "diversity_loss_mlp": 0.0, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.017950660829121307, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82906377, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.06152344, + "routerloss_mlp": 0.0, + "step": 3463, + "time_per_iteration": 5.041592359542847 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067724, + "balance_loss_mlp": 1.05844963, + "diversity_loss_mlp": 0.0, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.06793562911737132, + "language_loss": 0.86417711, + "learning_rate": 0.00026452490085933155, + "loss": 0.87485433, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3464, + "time_per_iteration": 2.5661425590515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069546, + "balance_loss_mlp": 1.05994368, + "diversity_loss_mlp": 0.0, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.08819800975527838, + "language_loss": 0.89818048, + "learning_rate": 0.00026425011648539614, + "loss": 0.90887594, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3465, + "time_per_iteration": 2.5488314628601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065239, + "balance_loss_mlp": 1.05584478, + "diversity_loss_mlp": 0.0, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06406494944770698, + "language_loss": 0.82567346, + "learning_rate": 0.00026397542363768267, + "loss": 0.83632582, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3466, + "time_per_iteration": 2.669250965118408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781407, + "balance_loss_mlp": 1.32080197, + "diversity_loss_mlp": 0.21862534, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.03313864292511896, + "language_loss": 0.8202821, + "learning_rate": 0.0002637008224228362, + "loss": 0.82809615, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169338, + "step": 3467, + "time_per_iteration": 2.572173833847046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070583, + "balance_loss_mlp": 1.06133246, + "diversity_loss_mlp": 0.0, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.05107139851875669, + "language_loss": 0.8441903, + "learning_rate": 0.00026342631294746653, + "loss": 0.85489613, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3468, + "time_per_iteration": 2.698885917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072156, + "balance_loss_mlp": 1.06254137, + "diversity_loss_mlp": 0.0, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.05734496396036439, + "language_loss": 0.80842233, + "learning_rate": 0.0002631518953181476, + "loss": 0.81914389, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3469, + "time_per_iteration": 3.4733734130859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101407, + "balance_loss_mlp": 1.0077759, + "diversity_loss_mlp": 0.0, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.015747171991140264, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.77339357, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.06298828, + "routerloss_mlp": 0.0, + "step": 3470, + "time_per_iteration": 4.929265737533569 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074034, + "balance_loss_mlp": 1.06445539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.060826323549746535, + "language_loss": 0.80429429, + "learning_rate": 0.00026260333602377985, + "loss": 0.81503463, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3471, + "time_per_iteration": 2.848822593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076958, + "balance_loss_mlp": 1.06758189, + "diversity_loss_mlp": 0.0, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.07184696149338711, + "language_loss": 0.87395489, + "learning_rate": 0.0002623291945717007, + "loss": 0.88472444, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3472, + "time_per_iteration": 2.500190019607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.06426954, + "diversity_loss_mlp": 0.0, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.06589735356893138, + "language_loss": 0.84111875, + "learning_rate": 0.00026205514539161175, + "loss": 0.85185778, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3473, + "time_per_iteration": 3.534797191619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.0632112, + "diversity_loss_mlp": 0.0, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.059882211902428664, + "language_loss": 0.83973366, + "learning_rate": 0.00026178118858990773, + "loss": 0.8504616, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3474, + "time_per_iteration": 2.8565967082977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070699, + "balance_loss_mlp": 1.06109083, + "diversity_loss_mlp": 0.0, + "epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.06021787961002869, + "language_loss": 0.84205377, + "learning_rate": 0.0002615073242729483, + "loss": 0.85276067, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3475, + "time_per_iteration": 2.678913116455078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070157, + "balance_loss_mlp": 1.0605185, + "diversity_loss_mlp": 0.0, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.05349171948445146, + "language_loss": 0.84449661, + "learning_rate": 0.0002612335525470573, + "loss": 0.85519814, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3476, + "time_per_iteration": 2.8754477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05415487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.0743507074362168, + "language_loss": 0.78049976, + "learning_rate": 0.0002609598735185221, + "loss": 0.79113823, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3477, + "time_per_iteration": 2.6721932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_mlp": 1.05687511, + "diversity_loss_mlp": 0.0, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.06005632064488323, + "language_loss": 0.83158946, + "learning_rate": 0.00026068628729359445, + "loss": 0.84225374, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3478, + "time_per_iteration": 2.7650654315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068151, + "balance_loss_mlp": 1.05830431, + "diversity_loss_mlp": 0.0, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.0704650229723735, + "language_loss": 0.76221395, + "learning_rate": 0.00026041279397848996, + "loss": 0.77289546, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3479, + "time_per_iteration": 2.8531105518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.055673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.06824163679163787, + "language_loss": 0.82570118, + "learning_rate": 0.00026013939367938797, + "loss": 0.8363536, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3480, + "time_per_iteration": 2.8762619495391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798551, + "balance_loss_mlp": 1.35232079, + "diversity_loss_mlp": 0.22152299, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.028482542431452974, + "language_loss": 0.81186199, + "learning_rate": 0.00025986608650243204, + "loss": 0.81984746, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01162949, + "step": 3481, + "time_per_iteration": 2.8153860569000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071437, + "balance_loss_mlp": 1.06166184, + "diversity_loss_mlp": 0.0, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.08903053329626802, + "language_loss": 0.79281807, + "learning_rate": 0.0002595928725537293, + "loss": 0.80353248, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3482, + "time_per_iteration": 2.8563952445983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_mlp": 1.05542827, + "diversity_loss_mlp": 0.0, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.06597366352184171, + "language_loss": 0.8811605, + "learning_rate": 0.0002593197519393509, + "loss": 0.89181018, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3483, + "time_per_iteration": 2.659468650817871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_mlp": 1.05117035, + "diversity_loss_mlp": 0.0, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.06129183928704833, + "language_loss": 0.79517573, + "learning_rate": 0.00025904672476533165, + "loss": 0.80578125, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3484, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_mlp": 1.0531497, + "diversity_loss_mlp": 0.0, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.06231151375576235, + "language_loss": 0.82821012, + "learning_rate": 0.0002587737911376704, + "loss": 0.83883744, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3485, + "time_per_iteration": 2.579852819442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.0560143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.06196157664485949, + "language_loss": 0.84223086, + "learning_rate": 0.00025850095116232885, + "loss": 0.85288531, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3486, + "time_per_iteration": 2.6867549419403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059913, + "balance_loss_mlp": 1.05029857, + "diversity_loss_mlp": 0.0, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.07455755751361211, + "language_loss": 0.77796304, + "learning_rate": 0.000258228204945233, + "loss": 0.78856218, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3487, + "time_per_iteration": 2.9104583263397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788296, + "balance_loss_mlp": 1.33072948, + "diversity_loss_mlp": 0.22110668, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.03107378418050736, + "language_loss": 0.84813625, + "learning_rate": 0.00025795555259227254, + "loss": 0.8560192, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0123779, + "step": 3488, + "time_per_iteration": 2.799049139022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064064, + "balance_loss_mlp": 1.05453348, + "diversity_loss_mlp": 0.0, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.05587900492957358, + "language_loss": 0.8365714, + "learning_rate": 0.00025768299420930046, + "loss": 0.84721196, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3489, + "time_per_iteration": 2.7350802421569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059988, + "balance_loss_mlp": 1.05058801, + "diversity_loss_mlp": 0.0, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.0636982622522837, + "language_loss": 0.83686626, + "learning_rate": 0.0002574105299021332, + "loss": 0.84746611, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3490, + "time_per_iteration": 2.8952267169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056705, + "balance_loss_mlp": 1.04722226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.059047086854658884, + "language_loss": 0.84235394, + "learning_rate": 0.00025713815977655084, + "loss": 0.85292095, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3491, + "time_per_iteration": 2.8801188468933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059823, + "balance_loss_mlp": 1.05020285, + "diversity_loss_mlp": 0.0, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.0713613195550899, + "language_loss": 0.84868813, + "learning_rate": 0.0002568658839382969, + "loss": 0.85928631, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3492, + "time_per_iteration": 2.565765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055936, + "balance_loss_mlp": 1.04666197, + "diversity_loss_mlp": 0.0, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.0809894292628365, + "language_loss": 0.8436929, + "learning_rate": 0.00025659370249307814, + "loss": 0.85425228, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3493, + "time_per_iteration": 2.61505126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_mlp": 1.04709792, + "diversity_loss_mlp": 0.0, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.06605957100839344, + "language_loss": 0.85386133, + "learning_rate": 0.00025632161554656473, + "loss": 0.86442864, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.09631348, + "routerloss_mlp": 0.0, + "step": 3494, + "time_per_iteration": 2.8639488220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_mlp": 1.04485643, + "diversity_loss_mlp": 0.0, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.0758709557174038, + "language_loss": 0.8232398, + "learning_rate": 0.00025604962320439017, + "loss": 0.83378488, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3495, + "time_per_iteration": 2.71235728263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_mlp": 1.04692411, + "diversity_loss_mlp": 0.0, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.06832671008161519, + "language_loss": 0.82082075, + "learning_rate": 0.0002557777255721516, + "loss": 0.83138162, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3496, + "time_per_iteration": 2.728652000427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052712, + "balance_loss_mlp": 1.04334807, + "diversity_loss_mlp": 0.0, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.07590882568517338, + "language_loss": 0.80502313, + "learning_rate": 0.0002555059227554087, + "loss": 0.81555027, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3497, + "time_per_iteration": 2.6704843044281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054312, + "balance_loss_mlp": 1.04488301, + "diversity_loss_mlp": 0.0, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.0738650094824256, + "language_loss": 0.77972269, + "learning_rate": 0.00025523421485968453, + "loss": 0.79026586, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3498, + "time_per_iteration": 2.8093771934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057909, + "balance_loss_mlp": 1.04843736, + "diversity_loss_mlp": 0.0, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.07086262263525961, + "language_loss": 0.85447127, + "learning_rate": 0.00025496260199046585, + "loss": 0.86505038, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3499, + "time_per_iteration": 3.0010836124420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105668, + "balance_loss_mlp": 1.04721487, + "diversity_loss_mlp": 0.0, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.056698795982303, + "language_loss": 0.84606051, + "learning_rate": 0.000254691084253202, + "loss": 0.85662723, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3500, + "time_per_iteration": 2.7931160926818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106411, + "balance_loss_mlp": 1.05446577, + "diversity_loss_mlp": 0.0, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.075539637024569, + "language_loss": 0.77243733, + "learning_rate": 0.00025441966175330567, + "loss": 0.78307843, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3501, + "time_per_iteration": 2.6508493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067964, + "balance_loss_mlp": 1.05850506, + "diversity_loss_mlp": 0.0, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.07065885937587965, + "language_loss": 0.79737401, + "learning_rate": 0.00025414833459615183, + "loss": 0.80805361, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3502, + "time_per_iteration": 2.784524917602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074197, + "balance_loss_mlp": 1.0648514, + "diversity_loss_mlp": 0.0, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.06652503704287359, + "language_loss": 0.80206275, + "learning_rate": 0.0002538771028870796, + "loss": 0.8128047, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3503, + "time_per_iteration": 2.802136182785034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075432, + "balance_loss_mlp": 1.06571674, + "diversity_loss_mlp": 0.0, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.06376799007020843, + "language_loss": 0.81455564, + "learning_rate": 0.0002536059667313903, + "loss": 0.82530999, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3504, + "time_per_iteration": 2.711933135986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068251, + "balance_loss_mlp": 1.05844057, + "diversity_loss_mlp": 0.0, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.09964706429340704, + "language_loss": 0.89608288, + "learning_rate": 0.0002533349262343483, + "loss": 0.9067654, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.09802246, + "routerloss_mlp": 0.0, + "step": 3505, + "time_per_iteration": 2.6715004444122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082337, + "balance_loss_mlp": 1.07268143, + "diversity_loss_mlp": 0.0, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.06572677444304757, + "language_loss": 0.81604284, + "learning_rate": 0.0002530639815011807, + "loss": 0.82686627, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3506, + "time_per_iteration": 2.4929287433624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078955, + "balance_loss_mlp": 1.33325195, + "diversity_loss_mlp": 0.2229899, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.03439328096706921, + "language_loss": 0.8506915, + "learning_rate": 0.0002527931326370781, + "loss": 0.85858697, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01142928, + "step": 3507, + "time_per_iteration": 2.83644962310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07446539, + "diversity_loss_mlp": 0.0, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.08750505461607005, + "language_loss": 0.82915336, + "learning_rate": 0.00025252237974719276, + "loss": 0.83999527, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3508, + "time_per_iteration": 2.871253252029419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081215, + "balance_loss_mlp": 1.07155883, + "diversity_loss_mlp": 0.0, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.08335060522291943, + "language_loss": 0.80458963, + "learning_rate": 0.00025225172293664056, + "loss": 0.81540173, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.09643555, + "routerloss_mlp": 0.0, + "step": 3509, + "time_per_iteration": 3.033853530883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014527, + "balance_loss_mlp": 1.00832772, + "diversity_loss_mlp": 0.0, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.01800991302482, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77947664, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.06176758, + "routerloss_mlp": 0.0, + "step": 3510, + "time_per_iteration": 4.911616325378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085264, + "balance_loss_mlp": 1.07521439, + "diversity_loss_mlp": 0.0, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.09401749664970258, + "language_loss": 0.84862983, + "learning_rate": 0.00025171069797381106, + "loss": 0.85948253, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.10046387, + "routerloss_mlp": 0.0, + "step": 3511, + "time_per_iteration": 2.8283350467681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071317, + "balance_loss_mlp": 1.06139874, + "diversity_loss_mlp": 0.0, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.06520954806538445, + "language_loss": 0.82273233, + "learning_rate": 0.00025144033003157864, + "loss": 0.83344549, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.09912109, + "routerloss_mlp": 0.0, + "step": 3512, + "time_per_iteration": 2.5983166694641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.06117821, + "diversity_loss_mlp": 0.0, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.08310754245868612, + "language_loss": 0.78935671, + "learning_rate": 0.00025117005858876806, + "loss": 0.80006635, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3513, + "time_per_iteration": 2.6797635555267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787238, + "balance_loss_mlp": 1.33182001, + "diversity_loss_mlp": 0.21994653, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.03353723121835004, + "language_loss": 0.85560071, + "learning_rate": 0.000250899883750308, + "loss": 0.86347306, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113544, + "step": 3514, + "time_per_iteration": 2.7176060676574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_mlp": 1.04921913, + "diversity_loss_mlp": 0.0, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.07453608092591449, + "language_loss": 0.81898236, + "learning_rate": 0.00025062980562109006, + "loss": 0.82957679, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.10223389, + "routerloss_mlp": 0.0, + "step": 3515, + "time_per_iteration": 2.7594966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789958, + "balance_loss_mlp": 1.33716106, + "diversity_loss_mlp": 0.21975538, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.033729691487123833, + "language_loss": 0.83036506, + "learning_rate": 0.0002503598243059677, + "loss": 0.83826458, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01150025, + "step": 3516, + "time_per_iteration": 2.891763687133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_mlp": 1.04839277, + "diversity_loss_mlp": 0.0, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.07017833187059877, + "language_loss": 0.80408925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81467211, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.09887695, + "routerloss_mlp": 0.0, + "step": 3517, + "time_per_iteration": 2.672029972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786476, + "balance_loss_mlp": 1.32907259, + "diversity_loss_mlp": 0.22110882, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.038425556988831724, + "language_loss": 0.85818875, + "learning_rate": 0.0002498201525372359, + "loss": 0.86605346, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138566, + "step": 3518, + "time_per_iteration": 2.617760419845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054389, + "balance_loss_mlp": 1.04459572, + "diversity_loss_mlp": 0.0, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.06814874892769256, + "language_loss": 0.83201683, + "learning_rate": 0.00024955046229314584, + "loss": 0.84256077, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.09783936, + "routerloss_mlp": 0.0, + "step": 3519, + "time_per_iteration": 2.6269547939300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_mlp": 1.04138207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06326657634867637, + "language_loss": 0.87517166, + "learning_rate": 0.00024928086928218947, + "loss": 0.88568723, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.10174561, + "routerloss_mlp": 0.0, + "step": 3520, + "time_per_iteration": 2.500542163848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057369, + "balance_loss_mlp": 1.04749823, + "diversity_loss_mlp": 0.0, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.0729210521666428, + "language_loss": 0.76251125, + "learning_rate": 0.00024901137360903216, + "loss": 0.77308488, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.09869385, + "routerloss_mlp": 0.0, + "step": 3521, + "time_per_iteration": 2.921558380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055481, + "balance_loss_mlp": 1.04586673, + "diversity_loss_mlp": 0.0, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.08065371435227142, + "language_loss": 0.80853164, + "learning_rate": 0.00024874197537830115, + "loss": 0.81908649, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3522, + "time_per_iteration": 2.5280978679656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793286, + "balance_loss_mlp": 1.3416667, + "diversity_loss_mlp": 0.22178407, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.034341347950706966, + "language_loss": 0.834656, + "learning_rate": 0.00024847267469458684, + "loss": 0.8425889, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0115611, + "step": 3523, + "time_per_iteration": 2.5251760482788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058563, + "balance_loss_mlp": 1.04881763, + "diversity_loss_mlp": 0.0, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.0593554156839795, + "language_loss": 0.77790511, + "learning_rate": 0.00024820347166244034, + "loss": 0.78849077, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.09741211, + "routerloss_mlp": 0.0, + "step": 3524, + "time_per_iteration": 2.9970362186431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061163, + "balance_loss_mlp": 1.051489, + "diversity_loss_mlp": 0.0, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.05785383684082485, + "language_loss": 0.8476572, + "learning_rate": 0.0002479343663863755, + "loss": 0.85826874, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3525, + "time_per_iteration": 2.748159885406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059476, + "balance_loss_mlp": 1.04968917, + "diversity_loss_mlp": 0.0, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 0.0719627260838572, + "language_loss": 0.76970756, + "learning_rate": 0.00024766535897086876, + "loss": 0.78030241, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3526, + "time_per_iteration": 2.5848824977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060587, + "balance_loss_mlp": 1.05073428, + "diversity_loss_mlp": 0.0, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.06835251841322831, + "language_loss": 0.79290187, + "learning_rate": 0.0002473964495203578, + "loss": 0.80350775, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.09851074, + "routerloss_mlp": 0.0, + "step": 3527, + "time_per_iteration": 2.6953914165496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106164, + "balance_loss_mlp": 1.05191827, + "diversity_loss_mlp": 0.0, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.06684083470405644, + "language_loss": 0.85681713, + "learning_rate": 0.0002471276381392425, + "loss": 0.86743355, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.09710693, + "routerloss_mlp": 0.0, + "step": 3528, + "time_per_iteration": 2.7917094230651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_mlp": 1.02451074, + "diversity_loss_mlp": 0.0, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.029269024795112553, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.7921958, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.06445312, + "routerloss_mlp": 0.0, + "step": 3529, + "time_per_iteration": 4.962055921554565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066825, + "balance_loss_mlp": 1.05733609, + "diversity_loss_mlp": 0.0, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06831388456608918, + "language_loss": 0.84243917, + "learning_rate": 0.00024659031000260826, + "loss": 0.85310745, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3530, + "time_per_iteration": 2.8746378421783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066552, + "balance_loss_mlp": 1.05688381, + "diversity_loss_mlp": 0.0, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.07285232550578888, + "language_loss": 0.80730051, + "learning_rate": 0.0002463217934556985, + "loss": 0.81796598, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3531, + "time_per_iteration": 2.7028424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014286, + "balance_loss_mlp": 1.00808728, + "diversity_loss_mlp": 0.0, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.01858574921496822, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77546376, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.06201172, + "routerloss_mlp": 0.0, + "step": 3532, + "time_per_iteration": 4.780252933502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071665, + "balance_loss_mlp": 1.06221724, + "diversity_loss_mlp": 0.0, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.08979673870599186, + "language_loss": 0.83808529, + "learning_rate": 0.0002457850559259306, + "loss": 0.84880191, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3533, + "time_per_iteration": 2.9009928703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107616, + "balance_loss_mlp": 1.06684947, + "diversity_loss_mlp": 0.0, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.06667977411786664, + "language_loss": 0.81866515, + "learning_rate": 0.00024551683515145275, + "loss": 0.82942677, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3534, + "time_per_iteration": 2.67411208152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.0675205, + "diversity_loss_mlp": 0.0, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.06662082176408471, + "language_loss": 0.86499625, + "learning_rate": 0.0002452487131761014, + "loss": 0.87576586, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3535, + "time_per_iteration": 2.723414421081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071026, + "balance_loss_mlp": 1.06126261, + "diversity_loss_mlp": 0.0, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.07513209939898634, + "language_loss": 0.79904449, + "learning_rate": 0.00024498069010397093, + "loss": 0.80975473, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.09753418, + "routerloss_mlp": 0.0, + "step": 3536, + "time_per_iteration": 2.729044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.06177378, + "diversity_loss_mlp": 0.0, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.062001089349607685, + "language_loss": 0.85142958, + "learning_rate": 0.00024471276603911697, + "loss": 0.86214507, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3537, + "time_per_iteration": 4.243680953979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.06360102, + "diversity_loss_mlp": 0.0, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.06230124795461592, + "language_loss": 0.79373354, + "learning_rate": 0.0002444449410855572, + "loss": 0.80446529, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3538, + "time_per_iteration": 2.744311571121216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071763, + "balance_loss_mlp": 1.06218505, + "diversity_loss_mlp": 0.0, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.057428584707934646, + "language_loss": 0.84307408, + "learning_rate": 0.00024417721534727033, + "loss": 0.85379171, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3539, + "time_per_iteration": 2.643796920776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073723, + "balance_loss_mlp": 1.06420994, + "diversity_loss_mlp": 0.0, + "epoch": 0.6810311658330127, + "flos": 426841270272.0, + "grad_norm": 0.09448746877359589, + "language_loss": 0.82968056, + "learning_rate": 0.00024390958892819687, + "loss": 0.8404178, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3540, + "time_per_iteration": 2.500807285308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010722, + "balance_loss_mlp": 1.0624193, + "diversity_loss_mlp": 0.0, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.06494427347835982, + "language_loss": 0.80941665, + "learning_rate": 0.0002436420619322381, + "loss": 0.82013869, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.09771729, + "routerloss_mlp": 0.0, + "step": 3541, + "time_per_iteration": 2.8345742225646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.0675267, + "diversity_loss_mlp": 0.0, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.07816741001086884, + "language_loss": 0.82754946, + "learning_rate": 0.0002433746344632577, + "loss": 0.83832312, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.0982666, + "routerloss_mlp": 0.0, + "step": 3542, + "time_per_iteration": 2.6863982677459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067339, + "balance_loss_mlp": 1.05741465, + "diversity_loss_mlp": 0.0, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.06517118266272649, + "language_loss": 0.80166835, + "learning_rate": 0.00024310730662508006, + "loss": 0.81234175, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3543, + "time_per_iteration": 3.0644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070309, + "balance_loss_mlp": 1.06105816, + "diversity_loss_mlp": 0.0, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.06994305910782121, + "language_loss": 0.87753445, + "learning_rate": 0.0002428400785214911, + "loss": 0.88823748, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3544, + "time_per_iteration": 2.5769219398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070709, + "balance_loss_mlp": 1.06136894, + "diversity_loss_mlp": 0.0, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.07082765333867001, + "language_loss": 0.82354796, + "learning_rate": 0.00024257295025623794, + "loss": 0.83425504, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3545, + "time_per_iteration": 2.799276828765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066867, + "balance_loss_mlp": 1.05750871, + "diversity_loss_mlp": 0.0, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.06649234916050309, + "language_loss": 0.8049404, + "learning_rate": 0.00024230592193302892, + "loss": 0.8156091, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3546, + "time_per_iteration": 2.9205825328826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064713, + "balance_loss_mlp": 1.05521762, + "diversity_loss_mlp": 0.0, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.07288649013986744, + "language_loss": 0.84268177, + "learning_rate": 0.00024203899365553372, + "loss": 0.85332888, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3547, + "time_per_iteration": 2.5345499515533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_mlp": 1.02241051, + "diversity_loss_mlp": 0.0, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.024887330229706912, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77762419, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.05639648, + "routerloss_mlp": 0.0, + "step": 3548, + "time_per_iteration": 4.575555801391602 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066126, + "balance_loss_mlp": 1.05700111, + "diversity_loss_mlp": 0.0, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.06418703018565212, + "language_loss": 0.83182037, + "learning_rate": 0.00024150543765216848, + "loss": 0.84248167, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3549, + "time_per_iteration": 2.9021003246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060662, + "balance_loss_mlp": 1.05113733, + "diversity_loss_mlp": 0.0, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.07049185581954354, + "language_loss": 0.83715057, + "learning_rate": 0.00024123881013344352, + "loss": 0.8477571, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3550, + "time_per_iteration": 2.671104669570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062514, + "balance_loss_mlp": 1.05335271, + "diversity_loss_mlp": 0.0, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.06503037380674516, + "language_loss": 0.7999897, + "learning_rate": 0.00024097228307472202, + "loss": 0.81061488, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3551, + "time_per_iteration": 2.826650619506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05474889, + "diversity_loss_mlp": 0.0, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.06680109192015529, + "language_loss": 0.82289582, + "learning_rate": 0.00024070585657947846, + "loss": 0.83353829, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3552, + "time_per_iteration": 2.831995725631714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.05527949, + "diversity_loss_mlp": 0.0, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.065434895685697, + "language_loss": 0.85023475, + "learning_rate": 0.00024043953075114934, + "loss": 0.86087978, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3553, + "time_per_iteration": 2.622846841812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055947, + "balance_loss_mlp": 1.0463928, + "diversity_loss_mlp": 0.0, + "epoch": 0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.07243414619593286, + "language_loss": 0.89257199, + "learning_rate": 0.00024017330569313128, + "loss": 0.90313148, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3554, + "time_per_iteration": 2.705098867416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065851, + "balance_loss_mlp": 1.05631375, + "diversity_loss_mlp": 0.0, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.06810293796091849, + "language_loss": 0.7482394, + "learning_rate": 0.0002399071815087821, + "loss": 0.7588979, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3555, + "time_per_iteration": 3.053788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.05496788, + "diversity_loss_mlp": 0.0, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.0721005752972134, + "language_loss": 0.83788198, + "learning_rate": 0.00023964115830142025, + "loss": 0.84852719, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3556, + "time_per_iteration": 2.7068707942962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062319, + "balance_loss_mlp": 1.05320573, + "diversity_loss_mlp": 0.0, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.07897700130685587, + "language_loss": 0.87426114, + "learning_rate": 0.00023937523617432522, + "loss": 0.88488424, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3557, + "time_per_iteration": 2.526129722595215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_mlp": 1.05461264, + "diversity_loss_mlp": 0.0, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.08002974259616906, + "language_loss": 0.8704505, + "learning_rate": 0.00023910941523073705, + "loss": 0.88108861, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3558, + "time_per_iteration": 3.884982109069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067156, + "balance_loss_mlp": 1.05752969, + "diversity_loss_mlp": 0.0, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.0697798269972245, + "language_loss": 0.86687434, + "learning_rate": 0.0002388436955738566, + "loss": 0.87754589, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3559, + "time_per_iteration": 2.6896438598632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.05763495, + "diversity_loss_mlp": 0.0, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.07371598831130721, + "language_loss": 0.81583881, + "learning_rate": 0.00023857807730684523, + "loss": 0.82651019, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 3560, + "time_per_iteration": 2.906409740447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.06119633, + "diversity_loss_mlp": 0.0, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.09020757950976771, + "language_loss": 0.82591355, + "learning_rate": 0.00023831256053282547, + "loss": 0.83662075, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 3561, + "time_per_iteration": 2.741647481918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076955, + "balance_loss_mlp": 1.06726301, + "diversity_loss_mlp": 0.0, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.06598100836979733, + "language_loss": 0.7798056, + "learning_rate": 0.00023804714535488003, + "loss": 0.79057515, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3562, + "time_per_iteration": 2.8663859367370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022665, + "balance_loss_mlp": 1.01694274, + "diversity_loss_mlp": 0.0, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.018293527884891043, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80832297, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.05712891, + "routerloss_mlp": 0.0, + "step": 3563, + "time_per_iteration": 4.938952684402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.06765318, + "diversity_loss_mlp": 0.0, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.06579070354920068, + "language_loss": 0.8089236, + "learning_rate": 0.00023751662019934488, + "loss": 0.81969196, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3564, + "time_per_iteration": 2.4886345863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085968, + "balance_loss_mlp": 1.07677126, + "diversity_loss_mlp": 0.0, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.06770513871895241, + "language_loss": 0.79428673, + "learning_rate": 0.00023725151042772364, + "loss": 0.80514634, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3565, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091397, + "balance_loss_mlp": 1.08220637, + "diversity_loss_mlp": 0.0, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.0657025292696896, + "language_loss": 0.83245081, + "learning_rate": 0.00023698650266411276, + "loss": 0.84336478, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3566, + "time_per_iteration": 2.619652032852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087671, + "balance_loss_mlp": 1.07844996, + "diversity_loss_mlp": 0.0, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.07570090303701395, + "language_loss": 0.82732457, + "learning_rate": 0.00023672159701139755, + "loss": 0.83820128, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3567, + "time_per_iteration": 3.2096190452575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092795, + "balance_loss_mlp": 1.08350825, + "diversity_loss_mlp": 0.0, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.07219945861824417, + "language_loss": 0.86111134, + "learning_rate": 0.00023645679357242296, + "loss": 0.87203926, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3568, + "time_per_iteration": 2.598115921020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792363, + "balance_loss_mlp": 1.34135008, + "diversity_loss_mlp": 0.22022857, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.03374979092207147, + "language_loss": 0.84308195, + "learning_rate": 0.00023619209244999534, + "loss": 0.85100567, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01157361, + "step": 3569, + "time_per_iteration": 2.647141695022583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.07559109, + "diversity_loss_mlp": 0.0, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.09720254317506574, + "language_loss": 0.85017771, + "learning_rate": 0.0002359274937468806, + "loss": 0.86102515, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3570, + "time_per_iteration": 2.5088424682617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080851, + "balance_loss_mlp": 1.07149255, + "diversity_loss_mlp": 0.0, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.06491952507138833, + "language_loss": 0.77798098, + "learning_rate": 0.00023566299756580512, + "loss": 0.78878951, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3571, + "time_per_iteration": 2.6349782943725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_mlp": 1.07132113, + "diversity_loss_mlp": 0.0, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.07205344290521438, + "language_loss": 0.78495932, + "learning_rate": 0.0002353986040094551, + "loss": 0.79576588, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3572, + "time_per_iteration": 2.4710493087768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079091, + "balance_loss_mlp": 1.06974494, + "diversity_loss_mlp": 0.0, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.07195013135933294, + "language_loss": 0.7977035, + "learning_rate": 0.00023513431318047796, + "loss": 0.80849445, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3573, + "time_per_iteration": 2.5213143825531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081479, + "balance_loss_mlp": 1.07233512, + "diversity_loss_mlp": 0.0, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.0671999790126143, + "language_loss": 0.77178657, + "learning_rate": 0.00023487012518147977, + "loss": 0.78260136, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3574, + "time_per_iteration": 3.2319135665893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073879, + "balance_loss_mlp": 1.06456256, + "diversity_loss_mlp": 0.0, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.06898424741609648, + "language_loss": 0.84452772, + "learning_rate": 0.00023460604011502772, + "loss": 0.85526657, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3575, + "time_per_iteration": 3.8878557682037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075527, + "balance_loss_mlp": 1.0666877, + "diversity_loss_mlp": 0.0, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.0699577179930161, + "language_loss": 0.85862118, + "learning_rate": 0.00023434205808364845, + "loss": 0.86937642, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3576, + "time_per_iteration": 3.1633143424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072972, + "balance_loss_mlp": 1.06390619, + "diversity_loss_mlp": 0.0, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.07476899851847786, + "language_loss": 0.85238355, + "learning_rate": 0.00023407817918982932, + "loss": 0.86311328, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3577, + "time_per_iteration": 2.7126357555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075894, + "balance_loss_mlp": 1.06677413, + "diversity_loss_mlp": 0.0, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.07427735671199864, + "language_loss": 0.78816962, + "learning_rate": 0.00023381440353601718, + "loss": 0.79892862, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3578, + "time_per_iteration": 2.9925150871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069003, + "balance_loss_mlp": 1.05976987, + "diversity_loss_mlp": 0.0, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.07604251893794473, + "language_loss": 0.86125422, + "learning_rate": 0.00023355073122461822, + "loss": 0.87194419, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3579, + "time_per_iteration": 2.938112258911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.05620754, + "diversity_loss_mlp": 0.0, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.06357801718819331, + "language_loss": 0.82597542, + "learning_rate": 0.00023328716235799973, + "loss": 0.83662832, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3580, + "time_per_iteration": 3.2711336612701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066362, + "balance_loss_mlp": 1.05755877, + "diversity_loss_mlp": 0.0, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.07922172227575792, + "language_loss": 0.84162283, + "learning_rate": 0.00023302369703848803, + "loss": 0.85228646, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3581, + "time_per_iteration": 2.8185226917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069197, + "balance_loss_mlp": 1.06004775, + "diversity_loss_mlp": 0.0, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.07416922878209098, + "language_loss": 0.79931486, + "learning_rate": 0.00023276033536836937, + "loss": 0.81000686, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3582, + "time_per_iteration": 2.844299554824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061227, + "balance_loss_mlp": 1.05179787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.06489183727188522, + "language_loss": 0.85119617, + "learning_rate": 0.00023249707744988984, + "loss": 0.86180842, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3583, + "time_per_iteration": 2.701711654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060915, + "balance_loss_mlp": 1.05140829, + "diversity_loss_mlp": 0.0, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.07019303893436639, + "language_loss": 0.82148254, + "learning_rate": 0.00023223392338525529, + "loss": 0.83209163, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3584, + "time_per_iteration": 2.5167200565338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053502, + "balance_loss_mlp": 1.04406083, + "diversity_loss_mlp": 0.0, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.06639305906088179, + "language_loss": 0.78639823, + "learning_rate": 0.00023197087327663107, + "loss": 0.79693329, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3585, + "time_per_iteration": 2.6349897384643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057819, + "balance_loss_mlp": 1.04834747, + "diversity_loss_mlp": 0.0, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.0732534701091779, + "language_loss": 0.81201088, + "learning_rate": 0.00023170792722614243, + "loss": 0.82258916, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3586, + "time_per_iteration": 2.9198050498962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056115, + "balance_loss_mlp": 1.04651892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.06720533838288198, + "language_loss": 0.83776879, + "learning_rate": 0.00023144508533587377, + "loss": 0.84832996, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3587, + "time_per_iteration": 2.8723502159118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054327, + "balance_loss_mlp": 1.04436147, + "diversity_loss_mlp": 0.0, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.07065225941485688, + "language_loss": 0.78699905, + "learning_rate": 0.0002311823477078698, + "loss": 0.79754233, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.09960938, + "routerloss_mlp": 0.0, + "step": 3588, + "time_per_iteration": 2.9407894611358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054694, + "balance_loss_mlp": 1.04507959, + "diversity_loss_mlp": 0.0, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.0778571388662146, + "language_loss": 0.85240763, + "learning_rate": 0.00023091971444413428, + "loss": 0.8629545, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.0960083, + "routerloss_mlp": 0.0, + "step": 3589, + "time_per_iteration": 2.796943187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054005, + "balance_loss_mlp": 1.04385448, + "diversity_loss_mlp": 0.0, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.0732795678952718, + "language_loss": 0.82600373, + "learning_rate": 0.00023065718564663012, + "loss": 0.8365438, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.1015625, + "routerloss_mlp": 0.0, + "step": 3590, + "time_per_iteration": 2.742586135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010537, + "balance_loss_mlp": 1.00519681, + "diversity_loss_mlp": 0.0, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + "grad_norm": 0.012465594930310886, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74922127, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.0534668, + "routerloss_mlp": 0.0, + "step": 3591, + "time_per_iteration": 4.981812477111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079259, + "balance_loss_mlp": 1.34177041, + "diversity_loss_mlp": 0.2198928, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.028847197535296083, + "language_loss": 0.80689478, + "learning_rate": 0.0002301324418579666, + "loss": 0.81482071, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0117582, + "step": 3592, + "time_per_iteration": 2.71809983253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0058906, + "balance_loss_mlp": 1.01557088, + "diversity_loss_mlp": 0.14263315, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.0010924650790030575, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79277533, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00995804, + "step": 3593, + "time_per_iteration": 4.800194263458252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064196, + "balance_loss_mlp": 1.05474234, + "diversity_loss_mlp": 0.0, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.08227146788009188, + "language_loss": 0.80700612, + "learning_rate": 0.00022960811715677415, + "loss": 0.81764805, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3594, + "time_per_iteration": 2.8780887126922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065404, + "balance_loss_mlp": 1.05574787, + "diversity_loss_mlp": 0.0, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.06283622806249096, + "language_loss": 0.82029772, + "learning_rate": 0.00022934611221845608, + "loss": 0.83095175, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3595, + "time_per_iteration": 2.80785870552063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062245, + "balance_loss_mlp": 1.05264866, + "diversity_loss_mlp": 0.0, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.07415067488634865, + "language_loss": 0.77666163, + "learning_rate": 0.00022908421235729609, + "loss": 0.78728402, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3596, + "time_per_iteration": 2.75410795211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065698, + "balance_loss_mlp": 1.05607235, + "diversity_loss_mlp": 0.0, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.06984612144500793, + "language_loss": 0.8509379, + "learning_rate": 0.0002288224176749728, + "loss": 0.86159492, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3597, + "time_per_iteration": 2.670696258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070664, + "balance_loss_mlp": 1.06105542, + "diversity_loss_mlp": 0.0, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.1037313094960325, + "language_loss": 0.78704476, + "learning_rate": 0.00022856072827312385, + "loss": 0.79775131, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.09606934, + "routerloss_mlp": 0.0, + "step": 3598, + "time_per_iteration": 2.795475959777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106581, + "balance_loss_mlp": 1.05624998, + "diversity_loss_mlp": 0.0, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.06439958207329444, + "language_loss": 0.77316082, + "learning_rate": 0.00022829914425334598, + "loss": 0.78381896, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3599, + "time_per_iteration": 2.6179866790771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064235, + "balance_loss_mlp": 1.05483484, + "diversity_loss_mlp": 0.0, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.06408780313496462, + "language_loss": 0.80725557, + "learning_rate": 0.0002280376657171956, + "loss": 0.81789792, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3600, + "time_per_iteration": 2.633162021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_mlp": 1.05445051, + "diversity_loss_mlp": 0.0, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.07377083778937557, + "language_loss": 0.76414573, + "learning_rate": 0.00022777629276618706, + "loss": 0.77478784, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3601, + "time_per_iteration": 3.0916104316711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065762, + "balance_loss_mlp": 1.05597496, + "diversity_loss_mlp": 0.0, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.06702562864271609, + "language_loss": 0.77948666, + "learning_rate": 0.0002275150255017947, + "loss": 0.79014426, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3602, + "time_per_iteration": 2.7668936252593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012943, + "balance_loss_mlp": 1.00765014, + "diversity_loss_mlp": 0.0, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.010670435186768691, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76745617, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.05297852, + "routerloss_mlp": 0.0, + "step": 3603, + "time_per_iteration": 5.010159492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011501, + "balance_loss_mlp": 1.00618434, + "diversity_loss_mlp": 0.0, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.00963913060826947, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76138604, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3604, + "time_per_iteration": 4.7926812171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061095, + "balance_loss_mlp": 1.05157018, + "diversity_loss_mlp": 0.0, + "epoch": 0.6935359753751443, + "flos": 540896901120.0, + "grad_norm": 0.06111799581134822, + "language_loss": 0.84283471, + "learning_rate": 0.0002267318588424379, + "loss": 0.85344565, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3605, + "time_per_iteration": 2.732388496398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_mlp": 1.04717207, + "diversity_loss_mlp": 0.0, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.07244313312376265, + "language_loss": 0.87551069, + "learning_rate": 0.00022647101533842845, + "loss": 0.88607633, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 3606, + "time_per_iteration": 3.001912832260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_mlp": 1.04882836, + "diversity_loss_mlp": 0.0, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.07498146805012186, + "language_loss": 0.76334918, + "learning_rate": 0.00022621027802778872, + "loss": 0.77393162, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3607, + "time_per_iteration": 2.6257400512695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052774, + "balance_loss_mlp": 1.04345798, + "diversity_loss_mlp": 0.0, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.07029819881410336, + "language_loss": 0.78756207, + "learning_rate": 0.00022594964701174586, + "loss": 0.79808986, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3608, + "time_per_iteration": 2.6099236011505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065561, + "balance_loss_mlp": 1.05642402, + "diversity_loss_mlp": 0.0, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.10152593614861574, + "language_loss": 0.84643018, + "learning_rate": 0.00022568912239148586, + "loss": 0.85708582, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3609, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059207, + "balance_loss_mlp": 1.04986095, + "diversity_loss_mlp": 0.0, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.06906376751770449, + "language_loss": 0.81638551, + "learning_rate": 0.00022542870426815344, + "loss": 0.82697761, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3610, + "time_per_iteration": 2.69460129737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_mlp": 1.04869449, + "diversity_loss_mlp": 0.0, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.07528135941421366, + "language_loss": 0.86051476, + "learning_rate": 0.00022516839274285173, + "loss": 0.87109709, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3611, + "time_per_iteration": 2.5634658336639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063678, + "balance_loss_mlp": 1.05389714, + "diversity_loss_mlp": 0.0, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.06331906344074151, + "language_loss": 0.7521888, + "learning_rate": 0.00022490818791664265, + "loss": 0.76282561, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3612, + "time_per_iteration": 2.617492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067849, + "balance_loss_mlp": 1.05837226, + "diversity_loss_mlp": 0.0, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.05946591075452152, + "language_loss": 0.85666263, + "learning_rate": 0.00022464808989054676, + "loss": 0.86734116, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3613, + "time_per_iteration": 2.6678874492645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789837, + "balance_loss_mlp": 1.33770788, + "diversity_loss_mlp": 0.21965824, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.03604068217542595, + "language_loss": 0.76138353, + "learning_rate": 0.00022438809876554284, + "loss": 0.76928186, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115366, + "step": 3614, + "time_per_iteration": 2.6613171100616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070097, + "balance_loss_mlp": 1.0602442, + "diversity_loss_mlp": 0.0, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.08971125257054285, + "language_loss": 0.80425173, + "learning_rate": 0.00022412821464256873, + "loss": 0.81495273, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.09844971, + "routerloss_mlp": 0.0, + "step": 3615, + "time_per_iteration": 2.7288718223571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_mlp": 1.06157804, + "diversity_loss_mlp": 0.0, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.07384702921709109, + "language_loss": 0.82342923, + "learning_rate": 0.00022386843762252023, + "loss": 0.83414114, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3616, + "time_per_iteration": 2.5761711597442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106737, + "balance_loss_mlp": 1.0575707, + "diversity_loss_mlp": 0.0, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.07908443617567998, + "language_loss": 0.79798818, + "learning_rate": 0.00022360876780625193, + "loss": 0.80866194, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.09790039, + "routerloss_mlp": 0.0, + "step": 3617, + "time_per_iteration": 2.6008386611938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059868, + "balance_loss_mlp": 1.05015886, + "diversity_loss_mlp": 0.0, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.07021226627677062, + "language_loss": 0.80116498, + "learning_rate": 0.00022334920529457604, + "loss": 0.81176364, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3618, + "time_per_iteration": 2.9185733795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105864, + "balance_loss_mlp": 1.04876924, + "diversity_loss_mlp": 0.0, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.05697997760775425, + "language_loss": 0.87189567, + "learning_rate": 0.00022308975018826423, + "loss": 0.88248205, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.09863281, + "routerloss_mlp": 0.0, + "step": 3619, + "time_per_iteration": 2.927544355392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_mlp": 1.04414856, + "diversity_loss_mlp": 0.0, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.0740354998090604, + "language_loss": 0.84932256, + "learning_rate": 0.00022283040258804564, + "loss": 0.85986531, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.10125732, + "routerloss_mlp": 0.0, + "step": 3620, + "time_per_iteration": 2.755613327026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787662, + "balance_loss_mlp": 1.33203387, + "diversity_loss_mlp": 0.22018704, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.033538632644234186, + "language_loss": 0.83875167, + "learning_rate": 0.00022257116259460802, + "loss": 0.84662825, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155162, + "step": 3621, + "time_per_iteration": 2.844062089920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_mlp": 1.03843641, + "diversity_loss_mlp": 0.0, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.06349986715080715, + "language_loss": 0.81602001, + "learning_rate": 0.00022231203030859725, + "loss": 0.82649869, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3622, + "time_per_iteration": 2.9582505226135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053512, + "balance_loss_mlp": 1.04382682, + "diversity_loss_mlp": 0.0, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.09473470519326596, + "language_loss": 0.83760095, + "learning_rate": 0.00022205300583061737, + "loss": 0.84813607, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 3623, + "time_per_iteration": 2.5727412700653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016252, + "balance_loss_mlp": 1.01057744, + "diversity_loss_mlp": 0.0, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.01746847385777515, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83854461, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.05664062, + "routerloss_mlp": 0.0, + "step": 3624, + "time_per_iteration": 4.8940582275390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051503, + "balance_loss_mlp": 1.04190028, + "diversity_loss_mlp": 0.0, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.07214179790538137, + "language_loss": 0.77598304, + "learning_rate": 0.00022153528070095735, + "loss": 0.78649807, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3625, + "time_per_iteration": 2.694251298904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_mlp": 1.03960037, + "diversity_loss_mlp": 0.0, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07542787145084529, + "language_loss": 0.88381326, + "learning_rate": 0.00022127658025027568, + "loss": 0.89430594, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.09655762, + "routerloss_mlp": 0.0, + "step": 3626, + "time_per_iteration": 2.6595661640167236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053933, + "balance_loss_mlp": 1.04412818, + "diversity_loss_mlp": 0.0, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.08038583191357998, + "language_loss": 0.85689813, + "learning_rate": 0.00022101798800962258, + "loss": 0.86743748, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3627, + "time_per_iteration": 2.6137661933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057745, + "balance_loss_mlp": 1.04847646, + "diversity_loss_mlp": 0.0, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.08075391789271535, + "language_loss": 0.78634858, + "learning_rate": 0.00022075950407939227, + "loss": 0.79692602, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3628, + "time_per_iteration": 2.6296188831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059141, + "balance_loss_mlp": 1.04959214, + "diversity_loss_mlp": 0.0, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.0897351301563825, + "language_loss": 0.8281461, + "learning_rate": 0.0002205011285599367, + "loss": 0.83873749, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.09539795, + "routerloss_mlp": 0.0, + "step": 3629, + "time_per_iteration": 2.6147000789642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079513, + "balance_loss_mlp": 1.34714937, + "diversity_loss_mlp": 0.21970588, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.029792453728032804, + "language_loss": 0.80962801, + "learning_rate": 0.00022024286155156658, + "loss": 0.81757927, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01170244, + "step": 3630, + "time_per_iteration": 2.8613815307617188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058875, + "balance_loss_mlp": 1.04967785, + "diversity_loss_mlp": 0.0, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.10033041150535157, + "language_loss": 0.86079919, + "learning_rate": 0.00021998470315454994, + "loss": 0.87138796, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3631, + "time_per_iteration": 2.647185802459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061381, + "balance_loss_mlp": 1.05195761, + "diversity_loss_mlp": 0.0, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.06594571513985185, + "language_loss": 0.86829215, + "learning_rate": 0.00021972665346911275, + "loss": 0.87890601, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3632, + "time_per_iteration": 2.757704257965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065424, + "balance_loss_mlp": 1.05622673, + "diversity_loss_mlp": 0.0, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.06824207534465764, + "language_loss": 0.79957312, + "learning_rate": 0.00021946871259543877, + "loss": 0.81022739, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3633, + "time_per_iteration": 2.577909231185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063518, + "balance_loss_mlp": 1.05467892, + "diversity_loss_mlp": 0.0, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.08329780404335202, + "language_loss": 0.83364546, + "learning_rate": 0.00021921088063366957, + "loss": 0.84428072, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3634, + "time_per_iteration": 2.933506965637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106547, + "balance_loss_mlp": 1.05625534, + "diversity_loss_mlp": 0.0, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.06097911291290099, + "language_loss": 0.81932688, + "learning_rate": 0.00021895315768390435, + "loss": 0.82998157, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3635, + "time_per_iteration": 2.6155378818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + "balance_loss_mlp": 1.06179357, + "diversity_loss_mlp": 0.0, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.05851098027896569, + "language_loss": 0.87547219, + "learning_rate": 0.00021869554384619999, + "loss": 0.88618374, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 3636, + "time_per_iteration": 2.9845876693725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.05937409, + "diversity_loss_mlp": 0.0, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.066101183722826, + "language_loss": 0.80819213, + "learning_rate": 0.00021843803922057115, + "loss": 0.81887871, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3637, + "time_per_iteration": 2.736743688583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069376, + "balance_loss_mlp": 1.060215, + "diversity_loss_mlp": 0.0, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.07934438223674636, + "language_loss": 0.8197611, + "learning_rate": 0.00021818064390698977, + "loss": 0.83045483, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3638, + "time_per_iteration": 2.6075611114501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_mlp": 1.06178594, + "diversity_loss_mlp": 0.0, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.0705113992952529, + "language_loss": 0.87237096, + "learning_rate": 0.0002179233580053861, + "loss": 0.88307768, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3639, + "time_per_iteration": 2.7142910957336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107015, + "balance_loss_mlp": 1.06120896, + "diversity_loss_mlp": 0.0, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.07560028355572443, + "language_loss": 0.85636085, + "learning_rate": 0.00021766618161564688, + "loss": 0.86706233, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3640, + "time_per_iteration": 2.7285115718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065402, + "balance_loss_mlp": 1.0562886, + "diversity_loss_mlp": 0.0, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.06395770762467583, + "language_loss": 0.87343419, + "learning_rate": 0.00021740911483761677, + "loss": 0.88408822, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3641, + "time_per_iteration": 2.584667205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068314, + "balance_loss_mlp": 1.05936706, + "diversity_loss_mlp": 0.0, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.05940351360925286, + "language_loss": 0.91777283, + "learning_rate": 0.00021715215777109837, + "loss": 0.92845595, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 3642, + "time_per_iteration": 2.9933156967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.06025815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.07347565488383569, + "language_loss": 0.84518594, + "learning_rate": 0.00021689531051585103, + "loss": 0.85587853, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3643, + "time_per_iteration": 2.6531710624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067704, + "balance_loss_mlp": 1.05844164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.08696231717445767, + "language_loss": 0.80713868, + "learning_rate": 0.00021663857317159196, + "loss": 0.81781578, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3644, + "time_per_iteration": 2.604703426361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072761, + "balance_loss_mlp": 1.06396961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.057193672258815845, + "language_loss": 0.81973934, + "learning_rate": 0.00021638194583799487, + "loss": 0.83046699, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3645, + "time_per_iteration": 2.6747145652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067445, + "balance_loss_mlp": 1.05851054, + "diversity_loss_mlp": 0.0, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.08498226844175927, + "language_loss": 0.82551372, + "learning_rate": 0.00021612542861469176, + "loss": 0.83618826, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3646, + "time_per_iteration": 3.2375802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067002, + "balance_loss_mlp": 1.05810285, + "diversity_loss_mlp": 0.0, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.07003978186883456, + "language_loss": 0.8260622, + "learning_rate": 0.00021586902160127135, + "loss": 0.83673215, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3647, + "time_per_iteration": 2.6448206901550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076859, + "balance_loss_mlp": 1.06791854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.11788208419913924, + "language_loss": 0.74163634, + "learning_rate": 0.00021561272489727974, + "loss": 0.75240493, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3648, + "time_per_iteration": 2.5040485858917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.07128358, + "diversity_loss_mlp": 0.0, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.06337788759133205, + "language_loss": 0.8008945, + "learning_rate": 0.0002153565386022199, + "loss": 0.81169432, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 3649, + "time_per_iteration": 2.7248024940490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076924, + "balance_loss_mlp": 1.06812, + "diversity_loss_mlp": 0.0, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.0801860998557123, + "language_loss": 0.82855487, + "learning_rate": 0.00021510046281555262, + "loss": 0.83932412, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 3650, + "time_per_iteration": 2.809051036834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077447, + "balance_loss_mlp": 1.06870925, + "diversity_loss_mlp": 0.0, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.08542793543919469, + "language_loss": 0.81736684, + "learning_rate": 0.0002148444976366949, + "loss": 0.82814133, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 3651, + "time_per_iteration": 2.7492573261260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084402, + "balance_loss_mlp": 1.07583714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.0799718694707253, + "language_loss": 0.82820916, + "learning_rate": 0.00021458864316502136, + "loss": 0.83905321, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 3652, + "time_per_iteration": 2.7140626907348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.07368028, + "diversity_loss_mlp": 0.0, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.0716785593922181, + "language_loss": 0.87417138, + "learning_rate": 0.0002143328994998634, + "loss": 0.88499534, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 3653, + "time_per_iteration": 2.5076870918273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074305, + "balance_loss_mlp": 1.06541252, + "diversity_loss_mlp": 0.0, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.078552736129926, + "language_loss": 0.78368807, + "learning_rate": 0.00021407726674050982, + "loss": 0.79443109, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 3654, + "time_per_iteration": 2.8595826625823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077013, + "balance_loss_mlp": 1.06806064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.06456326920806615, + "language_loss": 0.8704083, + "learning_rate": 0.0002138217449862061, + "loss": 0.88117838, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3655, + "time_per_iteration": 2.727473258972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074047, + "balance_loss_mlp": 1.06530333, + "diversity_loss_mlp": 0.0, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.06685907167482581, + "language_loss": 0.78296137, + "learning_rate": 0.00021356633433615403, + "loss": 0.79370177, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3656, + "time_per_iteration": 2.5853357315063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072471, + "balance_loss_mlp": 1.06341755, + "diversity_loss_mlp": 0.0, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.05195711031116695, + "language_loss": 0.83568424, + "learning_rate": 0.0002133110348895133, + "loss": 0.84640896, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3657, + "time_per_iteration": 2.966989517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069658, + "balance_loss_mlp": 1.06044364, + "diversity_loss_mlp": 0.0, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.05842315057280589, + "language_loss": 0.85166538, + "learning_rate": 0.0002130558467453999, + "loss": 0.86236197, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3658, + "time_per_iteration": 3.3303468227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080025, + "balance_loss_mlp": 1.07069683, + "diversity_loss_mlp": 0.0, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06729984707772495, + "language_loss": 0.8469972, + "learning_rate": 0.0002128007700028865, + "loss": 0.85779744, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3659, + "time_per_iteration": 2.7004916667938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_mlp": 1.06041121, + "diversity_loss_mlp": 0.0, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.08608403684795747, + "language_loss": 0.84587854, + "learning_rate": 0.00021254580476100276, + "loss": 0.85657346, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3660, + "time_per_iteration": 2.5480196475982666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072222, + "balance_loss_mlp": 1.06278646, + "diversity_loss_mlp": 0.0, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.07339918095130941, + "language_loss": 0.79315257, + "learning_rate": 0.00021229095111873497, + "loss": 0.80387473, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3661, + "time_per_iteration": 2.7757935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791151, + "balance_loss_mlp": 1.34026599, + "diversity_loss_mlp": 0.21938899, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.027590424390171175, + "language_loss": 0.85883224, + "learning_rate": 0.0002120362091750261, + "loss": 0.8667438, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01132388, + "step": 3662, + "time_per_iteration": 2.896202802658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798199, + "balance_loss_mlp": 1.35343075, + "diversity_loss_mlp": 0.22044487, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.03684811642709949, + "language_loss": 0.87121612, + "learning_rate": 0.00021178157902877566, + "loss": 0.87919807, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01126087, + "step": 3663, + "time_per_iteration": 2.4897618293762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059718, + "balance_loss_mlp": 1.05026472, + "diversity_loss_mlp": 0.0, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.06585144557964606, + "language_loss": 0.868586, + "learning_rate": 0.0002115270607788397, + "loss": 0.87918323, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3664, + "time_per_iteration": 2.767237901687622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_mlp": 1.05233264, + "diversity_loss_mlp": 0.0, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.06809628156665722, + "language_loss": 0.8563199, + "learning_rate": 0.00021127265452403133, + "loss": 0.86693728, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3665, + "time_per_iteration": 2.5270590782165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028622, + "balance_loss_mlp": 1.02266109, + "diversity_loss_mlp": 0.0, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.030216242564882093, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85120249, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.05957031, + "routerloss_mlp": 0.0, + "step": 3666, + "time_per_iteration": 4.850507974624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_mlp": 1.04785872, + "diversity_loss_mlp": 0.0, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.07688296901308685, + "language_loss": 0.82549417, + "learning_rate": 0.00021076417839483065, + "loss": 0.83607054, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.09777832, + "routerloss_mlp": 0.0, + "step": 3667, + "time_per_iteration": 2.789318799972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00785288, + "balance_loss_mlp": 1.32734215, + "diversity_loss_mlp": 0.21942863, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.027872662040783723, + "language_loss": 0.85229611, + "learning_rate": 0.00021051010871784589, + "loss": 0.86014903, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01190263, + "step": 3668, + "time_per_iteration": 2.6029293537139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_mlp": 1.03972173, + "diversity_loss_mlp": 0.0, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.06094440535163373, + "language_loss": 0.79136097, + "learning_rate": 0.0002102561514308045, + "loss": 0.80185533, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.09698486, + "routerloss_mlp": 0.0, + "step": 3669, + "time_per_iteration": 2.717550754547119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_mlp": 1.03882289, + "diversity_loss_mlp": 0.0, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 0.06685679205809081, + "language_loss": 0.82684934, + "learning_rate": 0.00021000230663230135, + "loss": 0.83733451, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.09680176, + "routerloss_mlp": 0.0, + "step": 3670, + "time_per_iteration": 2.663641929626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_mlp": 1.03758621, + "diversity_loss_mlp": 0.0, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.0788999580683501, + "language_loss": 0.8333686, + "learning_rate": 0.00020974857442088762, + "loss": 0.84384131, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3671, + "time_per_iteration": 2.603200674057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_mlp": 1.04090595, + "diversity_loss_mlp": 0.0, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.06597055707746856, + "language_loss": 0.89200228, + "learning_rate": 0.00020949495489507104, + "loss": 0.90250599, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3672, + "time_per_iteration": 2.6877996921539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_mlp": 1.04270363, + "diversity_loss_mlp": 0.0, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.17274894008002345, + "language_loss": 0.84991109, + "learning_rate": 0.00020924144815331525, + "loss": 0.86043334, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3673, + "time_per_iteration": 2.5844242572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_mlp": 1.04517114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.0640379080300773, + "language_loss": 0.83600396, + "learning_rate": 0.00020898805429404044, + "loss": 0.84655201, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.09625244, + "routerloss_mlp": 0.0, + "step": 3674, + "time_per_iteration": 2.676417350769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_mlp": 1.04724169, + "diversity_loss_mlp": 0.0, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.0780577693768427, + "language_loss": 0.78793156, + "learning_rate": 0.0002087347734156228, + "loss": 0.79849994, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.09594727, + "routerloss_mlp": 0.0, + "step": 3675, + "time_per_iteration": 2.8697783946990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057522, + "balance_loss_mlp": 1.04800272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.0710988084964876, + "language_loss": 0.79834986, + "learning_rate": 0.00020848160561639452, + "loss": 0.80892509, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.09515381, + "routerloss_mlp": 0.0, + "step": 3676, + "time_per_iteration": 2.7413785457611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106069, + "balance_loss_mlp": 1.05147529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.06834186778178446, + "language_loss": 0.86040401, + "learning_rate": 0.0002082285509946445, + "loss": 0.8710109, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3677, + "time_per_iteration": 2.5471127033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063838, + "balance_loss_mlp": 1.05436051, + "diversity_loss_mlp": 0.0, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.06236421972787801, + "language_loss": 0.83409554, + "learning_rate": 0.00020797560964861683, + "loss": 0.84473389, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3678, + "time_per_iteration": 2.748696804046631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.05635202, + "diversity_loss_mlp": 0.0, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.07878907365407993, + "language_loss": 0.80641901, + "learning_rate": 0.0002077227816765122, + "loss": 0.81707478, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3679, + "time_per_iteration": 3.000666618347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_mlp": 1.03114033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.025842314854182848, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77483988, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.05126953, + "routerloss_mlp": 0.0, + "step": 3680, + "time_per_iteration": 4.779016971588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106697, + "balance_loss_mlp": 1.05772507, + "diversity_loss_mlp": 0.0, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.06703239561102693, + "language_loss": 0.78754878, + "learning_rate": 0.00020721746624665383, + "loss": 0.79821849, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3681, + "time_per_iteration": 2.7041916847229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073317, + "balance_loss_mlp": 1.06381631, + "diversity_loss_mlp": 0.0, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.06071055961479113, + "language_loss": 0.80160034, + "learning_rate": 0.00020696497898508114, + "loss": 0.81233358, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3682, + "time_per_iteration": 3.003126382827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073203, + "balance_loss_mlp": 1.06374955, + "diversity_loss_mlp": 0.0, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.0794178936209596, + "language_loss": 0.77425051, + "learning_rate": 0.00020671260548979316, + "loss": 0.7849825, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3683, + "time_per_iteration": 3.000619649887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079652, + "balance_loss_mlp": 1.07019854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7087341285109657, + "flos": 700566340608.0, + "grad_norm": 0.06569012319146904, + "language_loss": 0.85012448, + "learning_rate": 0.00020646034585876982, + "loss": 0.86092097, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3684, + "time_per_iteration": 2.8407599925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788495, + "balance_loss_mlp": 1.33244729, + "diversity_loss_mlp": 0.22155851, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.02817752508262258, + "language_loss": 0.84630954, + "learning_rate": 0.00020620820018994718, + "loss": 0.8541944, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0114923, + "step": 3685, + "time_per_iteration": 2.8807289600372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791818, + "balance_loss_mlp": 1.33957911, + "diversity_loss_mlp": 0.22135019, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.03572846620936607, + "language_loss": 0.83307725, + "learning_rate": 0.00020595616858121675, + "loss": 0.84099543, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113536, + "step": 3686, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075035, + "balance_loss_mlp": 1.06569517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.05825520117041851, + "language_loss": 0.80985916, + "learning_rate": 0.00020570425113042586, + "loss": 0.82060945, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 3687, + "time_per_iteration": 2.724151611328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078198, + "balance_loss_mlp": 1.06894779, + "diversity_loss_mlp": 0.0, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.0736963808397267, + "language_loss": 0.8558749, + "learning_rate": 0.0002054524479353776, + "loss": 0.8666569, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3688, + "time_per_iteration": 2.7505970001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074288, + "balance_loss_mlp": 1.06498957, + "diversity_loss_mlp": 0.0, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07506666957013575, + "language_loss": 0.81571054, + "learning_rate": 0.00020520075909383063, + "loss": 0.82645345, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3689, + "time_per_iteration": 2.854198694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074645, + "balance_loss_mlp": 1.06511474, + "diversity_loss_mlp": 0.0, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.06551416788386397, + "language_loss": 0.80860078, + "learning_rate": 0.00020494918470349916, + "loss": 0.81934714, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3690, + "time_per_iteration": 3.2713325023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079528, + "balance_loss_mlp": 1.34716058, + "diversity_loss_mlp": 0.22097552, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.03587666052644611, + "language_loss": 0.85333264, + "learning_rate": 0.00020469772486205297, + "loss": 0.86128545, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01121199, + "step": 3691, + "time_per_iteration": 2.626685380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787595, + "balance_loss_mlp": 1.33183146, + "diversity_loss_mlp": 0.22060202, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.030476334667887343, + "language_loss": 0.81455922, + "learning_rate": 0.0002044463796671177, + "loss": 0.82243514, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0113784, + "step": 3692, + "time_per_iteration": 2.7819416522979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_mlp": 1.06465387, + "diversity_loss_mlp": 0.0, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.07963770038273417, + "language_loss": 0.8046093, + "learning_rate": 0.00020419514921627408, + "loss": 0.81534946, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3693, + "time_per_iteration": 2.8676981925964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069877, + "balance_loss_mlp": 1.06088233, + "diversity_loss_mlp": 0.0, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.07391756130926609, + "language_loss": 0.77261078, + "learning_rate": 0.00020394403360705855, + "loss": 0.78330958, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3694, + "time_per_iteration": 2.695068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788663, + "balance_loss_mlp": 1.33321095, + "diversity_loss_mlp": 0.22100018, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.034812211167962216, + "language_loss": 0.88271379, + "learning_rate": 0.00020369303293696228, + "loss": 0.89060044, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155703, + "step": 3695, + "time_per_iteration": 2.601621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066517, + "balance_loss_mlp": 1.05723643, + "diversity_loss_mlp": 0.0, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.07715335648803619, + "language_loss": 0.78224587, + "learning_rate": 0.00020344214730343304, + "loss": 0.79291105, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3696, + "time_per_iteration": 2.6193599700927734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_mlp": 1.05618572, + "diversity_loss_mlp": 0.0, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.05468894944159508, + "language_loss": 0.79277122, + "learning_rate": 0.00020319137680387296, + "loss": 0.80342424, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3697, + "time_per_iteration": 2.9309933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060045, + "balance_loss_mlp": 1.05068743, + "diversity_loss_mlp": 0.0, + "epoch": 0.7114274721046556, + "flos": 448060709376.0, + "grad_norm": 0.07057759031394817, + "language_loss": 0.80451727, + "learning_rate": 0.0002029407215356398, + "loss": 0.81511772, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3698, + "time_per_iteration": 2.4956727027893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058583, + "balance_loss_mlp": 1.04976714, + "diversity_loss_mlp": 0.0, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.0722387573875999, + "language_loss": 0.83844793, + "learning_rate": 0.00020269018159604663, + "loss": 0.84903371, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3699, + "time_per_iteration": 2.731231689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057429, + "balance_loss_mlp": 1.04814827, + "diversity_loss_mlp": 0.0, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.07123396580800914, + "language_loss": 0.818003, + "learning_rate": 0.00020243975708236162, + "loss": 0.82857728, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 3700, + "time_per_iteration": 2.597215414047241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781944, + "balance_loss_mlp": 1.31673443, + "diversity_loss_mlp": 0.22274226, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.030217464674653638, + "language_loss": 0.86634398, + "learning_rate": 0.00020218944809180818, + "loss": 0.87416339, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01220552, + "step": 3701, + "time_per_iteration": 2.7128944396972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056546, + "balance_loss_mlp": 1.04739642, + "diversity_loss_mlp": 0.0, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.06969302254489844, + "language_loss": 0.84630072, + "learning_rate": 0.00020193925472156493, + "loss": 0.85686618, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3702, + "time_per_iteration": 2.695040702819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009738, + "balance_loss_mlp": 1.00442076, + "diversity_loss_mlp": 0.0, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.015177951683804305, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75298905, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3703, + "time_per_iteration": 4.91239857673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784779, + "balance_loss_mlp": 1.3239193, + "diversity_loss_mlp": 0.22157452, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.02622509859947044, + "language_loss": 0.83696187, + "learning_rate": 0.00020143921523049863, + "loss": 0.84480959, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01203172, + "step": 3704, + "time_per_iteration": 3.0262062549591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057244, + "balance_loss_mlp": 1.04805851, + "diversity_loss_mlp": 0.0, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.07737525798134272, + "language_loss": 0.838422, + "learning_rate": 0.00020118936930380837, + "loss": 0.84899437, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3705, + "time_per_iteration": 2.741217851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105596, + "balance_loss_mlp": 1.04639971, + "diversity_loss_mlp": 0.0, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.08146435226617602, + "language_loss": 0.80879092, + "learning_rate": 0.0002009396393856932, + "loss": 0.81935048, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3706, + "time_per_iteration": 2.643540143966675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_mlp": 1.0414114, + "diversity_loss_mlp": 0.0, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.07418360122955521, + "language_loss": 0.82790005, + "learning_rate": 0.00020069002557310673, + "loss": 0.83840382, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3707, + "time_per_iteration": 2.719648838043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052452, + "balance_loss_mlp": 1.04351699, + "diversity_loss_mlp": 0.0, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.05884856391484217, + "language_loss": 0.77115107, + "learning_rate": 0.00020044052796295807, + "loss": 0.78167558, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3708, + "time_per_iteration": 2.830353260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04202533, + "diversity_loss_mlp": 0.0, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.07889939453961878, + "language_loss": 0.82217181, + "learning_rate": 0.00020019114665211063, + "loss": 0.83268428, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 3709, + "time_per_iteration": 2.581709623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048706, + "balance_loss_mlp": 1.03982449, + "diversity_loss_mlp": 0.0, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.06519405348344502, + "language_loss": 0.81405282, + "learning_rate": 0.00019994188173738276, + "loss": 0.8245399, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3710, + "time_per_iteration": 2.5735976696014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049854, + "balance_loss_mlp": 1.04063272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.07046885330875076, + "language_loss": 0.80712581, + "learning_rate": 0.0001996927333155477, + "loss": 0.81762433, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 3711, + "time_per_iteration": 2.814368724822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_mlp": 1.04546654, + "diversity_loss_mlp": 0.0, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.07187972004168419, + "language_loss": 0.85349059, + "learning_rate": 0.00019944370148333346, + "loss": 0.8640365, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3712, + "time_per_iteration": 3.169759750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058298, + "balance_loss_mlp": 1.04938745, + "diversity_loss_mlp": 0.0, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.060002667598624965, + "language_loss": 0.79623508, + "learning_rate": 0.00019919478633742278, + "loss": 0.80681807, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3713, + "time_per_iteration": 2.644663095474243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061749, + "balance_loss_mlp": 1.05258763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.07397385813864758, + "language_loss": 0.85182703, + "learning_rate": 0.00019894598797445302, + "loss": 0.86244452, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3714, + "time_per_iteration": 2.5240604877471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061709, + "balance_loss_mlp": 1.05239308, + "diversity_loss_mlp": 0.0, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.07339492646897193, + "language_loss": 0.81885231, + "learning_rate": 0.00019869730649101615, + "loss": 0.82946944, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3715, + "time_per_iteration": 2.827868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063135, + "balance_loss_mlp": 1.05403948, + "diversity_loss_mlp": 0.0, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.0742719443850205, + "language_loss": 0.72613627, + "learning_rate": 0.00019844874198365943, + "loss": 0.73676765, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3716, + "time_per_iteration": 3.0963878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063516, + "balance_loss_mlp": 1.05428362, + "diversity_loss_mlp": 0.0, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.061591749317610134, + "language_loss": 0.83976817, + "learning_rate": 0.00019820029454888362, + "loss": 0.85040331, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3717, + "time_per_iteration": 2.7068889141082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006732, + "balance_loss_mlp": 1.0012722, + "diversity_loss_mlp": 0.0, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.016486733546314403, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75528002, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.0546875, + "routerloss_mlp": 0.0, + "step": 3718, + "time_per_iteration": 5.0301513671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010681, + "balance_loss_mlp": 1.05873013, + "diversity_loss_mlp": 0.0, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.06632920905024949, + "language_loss": 0.80107152, + "learning_rate": 0.0001977037512828529, + "loss": 0.81175244, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3719, + "time_per_iteration": 2.573982000350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066134, + "balance_loss_mlp": 1.05686522, + "diversity_loss_mlp": 0.0, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.05986593090344285, + "language_loss": 0.86432415, + "learning_rate": 0.0001974556556443734, + "loss": 0.87498546, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3720, + "time_per_iteration": 2.7087209224700928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106825, + "balance_loss_mlp": 1.0589757, + "diversity_loss_mlp": 0.0, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.05551674827732864, + "language_loss": 0.88590324, + "learning_rate": 0.00019720767746402547, + "loss": 0.89658576, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3721, + "time_per_iteration": 2.7290821075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010754, + "balance_loss_mlp": 1.06610191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.07406216566818759, + "language_loss": 0.79965603, + "learning_rate": 0.00019695981683808222, + "loss": 0.81041002, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3722, + "time_per_iteration": 2.8323793411254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072903, + "balance_loss_mlp": 1.06386733, + "diversity_loss_mlp": 0.0, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.08922707402242334, + "language_loss": 0.84955275, + "learning_rate": 0.00019671207386277225, + "loss": 0.86028177, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3723, + "time_per_iteration": 2.94681978225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069475, + "balance_loss_mlp": 1.06010544, + "diversity_loss_mlp": 0.0, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.07420263460977167, + "language_loss": 0.78355432, + "learning_rate": 0.0001964644486342777, + "loss": 0.79424912, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3724, + "time_per_iteration": 2.960944414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064733, + "balance_loss_mlp": 1.05573297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.0760825236490028, + "language_loss": 0.86588323, + "learning_rate": 0.00019621694124873524, + "loss": 0.87653053, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3725, + "time_per_iteration": 2.6881937980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101766, + "balance_loss_mlp": 1.01224804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.018433056607108506, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77557743, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.05419922, + "routerloss_mlp": 0.0, + "step": 3726, + "time_per_iteration": 4.8842387199401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_mlp": 1.04820442, + "diversity_loss_mlp": 0.0, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.08148717312552407, + "language_loss": 0.77167314, + "learning_rate": 0.00019572228039082428, + "loss": 0.78224969, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3727, + "time_per_iteration": 3.071643829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055911, + "balance_loss_mlp": 1.04670763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.05270267691232831, + "language_loss": 0.83482945, + "learning_rate": 0.0001954751271105002, + "loss": 0.84538865, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3728, + "time_per_iteration": 2.8301711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_mlp": 1.04496169, + "diversity_loss_mlp": 0.0, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.06896440922655821, + "language_loss": 0.80838037, + "learning_rate": 0.00019522809205721687, + "loss": 0.81892335, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 3729, + "time_per_iteration": 2.8094747066497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048553, + "balance_loss_mlp": 1.03930831, + "diversity_loss_mlp": 0.0, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.09744205035272979, + "language_loss": 0.83110106, + "learning_rate": 0.0001949811753268816, + "loss": 0.84158659, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3730, + "time_per_iteration": 2.6963374614715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045755, + "balance_loss_mlp": 1.03643274, + "diversity_loss_mlp": 0.0, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.0730125544637403, + "language_loss": 0.82630277, + "learning_rate": 0.00019473437701535634, + "loss": 0.83676028, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3731, + "time_per_iteration": 2.6076574325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_mlp": 1.03844213, + "diversity_loss_mlp": 0.0, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.07914181118847867, + "language_loss": 0.89615285, + "learning_rate": 0.00019448769721845677, + "loss": 0.90662855, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3732, + "time_per_iteration": 2.824897289276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047996, + "balance_loss_mlp": 1.03853655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.07061643018013358, + "language_loss": 0.86148334, + "learning_rate": 0.00019424113603195203, + "loss": 0.87196326, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3733, + "time_per_iteration": 2.520390510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_mlp": 1.03879809, + "diversity_loss_mlp": 0.0, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.07087799527916698, + "language_loss": 0.79863775, + "learning_rate": 0.0001939946935515657, + "loss": 0.80912238, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3734, + "time_per_iteration": 2.8286993503570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_mlp": 1.03927684, + "diversity_loss_mlp": 0.0, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.08245280249652003, + "language_loss": 0.80650169, + "learning_rate": 0.0001937483698729755, + "loss": 0.8169921, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3735, + "time_per_iteration": 2.6458795070648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_mlp": 1.0338974, + "diversity_loss_mlp": 0.0, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.07515481344769812, + "language_loss": 0.82211673, + "learning_rate": 0.0001935021650918128, + "loss": 0.83255374, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.0980835, + "routerloss_mlp": 0.0, + "step": 3736, + "time_per_iteration": 3.0285887718200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_mlp": 1.03346682, + "diversity_loss_mlp": 0.0, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.06979349456564556, + "language_loss": 0.87017608, + "learning_rate": 0.0001932560793036625, + "loss": 0.88060999, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.09924316, + "routerloss_mlp": 0.0, + "step": 3737, + "time_per_iteration": 2.482374906539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_mlp": 1.03452408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.08340257337042449, + "language_loss": 0.86882925, + "learning_rate": 0.00019301011260406382, + "loss": 0.87927186, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.09716797, + "routerloss_mlp": 0.0, + "step": 3738, + "time_per_iteration": 2.6162045001983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_mlp": 1.03576994, + "diversity_loss_mlp": 0.0, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.0721539169034284, + "language_loss": 0.79805303, + "learning_rate": 0.00019276426508850936, + "loss": 0.80850697, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.09619141, + "routerloss_mlp": 0.0, + "step": 3739, + "time_per_iteration": 2.7380456924438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_mlp": 1.03111315, + "diversity_loss_mlp": 0.0, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.0788007665709812, + "language_loss": 0.80469853, + "learning_rate": 0.00019251853685244564, + "loss": 0.81510872, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.09899902, + "routerloss_mlp": 0.0, + "step": 3740, + "time_per_iteration": 3.0559754371643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_mlp": 1.03485012, + "diversity_loss_mlp": 0.0, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.07989753754857366, + "language_loss": 0.80738026, + "learning_rate": 0.00019227292799127283, + "loss": 0.81782538, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.09661865, + "routerloss_mlp": 0.0, + "step": 3741, + "time_per_iteration": 3.0058369636535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_mlp": 1.03530192, + "diversity_loss_mlp": 0.0, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.17846470971826942, + "language_loss": 0.79000109, + "learning_rate": 0.00019202743860034454, + "loss": 0.80044937, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3742, + "time_per_iteration": 3.218614339828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043684, + "balance_loss_mlp": 1.03441513, + "diversity_loss_mlp": 0.0, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.07729553507192725, + "language_loss": 0.83831203, + "learning_rate": 0.00019178206877496873, + "loss": 0.84874886, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 3743, + "time_per_iteration": 2.7014403343200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048278, + "balance_loss_mlp": 1.03885424, + "diversity_loss_mlp": 0.0, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.06342209640567653, + "language_loss": 0.85333169, + "learning_rate": 0.0001915368186104059, + "loss": 0.86381447, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3744, + "time_per_iteration": 2.733520746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105441, + "balance_loss_mlp": 1.04513526, + "diversity_loss_mlp": 0.0, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.08207076889899251, + "language_loss": 0.81176144, + "learning_rate": 0.0001912916882018706, + "loss": 0.8223055, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3745, + "time_per_iteration": 2.7833125591278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057126, + "balance_loss_mlp": 1.04774427, + "diversity_loss_mlp": 0.0, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.08263651010752651, + "language_loss": 0.79468751, + "learning_rate": 0.00019104667764453125, + "loss": 0.80525875, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 3746, + "time_per_iteration": 3.0572047233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066676, + "balance_loss_mlp": 1.05751503, + "diversity_loss_mlp": 0.0, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.06554660744507769, + "language_loss": 0.80441052, + "learning_rate": 0.00019080178703350926, + "loss": 0.8150773, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3747, + "time_per_iteration": 2.6344070434570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067771, + "balance_loss_mlp": 1.05819249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.07164749029527417, + "language_loss": 0.83225226, + "learning_rate": 0.00019055701646387952, + "loss": 0.84292996, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 3748, + "time_per_iteration": 2.674436330795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014621, + "balance_loss_mlp": 1.00935245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 0.01350364958452467, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.8148731, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3749, + "time_per_iteration": 4.8169167041778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074721, + "balance_loss_mlp": 1.06568444, + "diversity_loss_mlp": 0.0, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.09948968640859872, + "language_loss": 0.86443639, + "learning_rate": 0.00019006783582886368, + "loss": 0.87518358, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3750, + "time_per_iteration": 2.6094882488250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082564, + "balance_loss_mlp": 1.0731287, + "diversity_loss_mlp": 0.0, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.0940617497046545, + "language_loss": 0.8313877, + "learning_rate": 0.00018982342595339437, + "loss": 0.84221339, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3751, + "time_per_iteration": 4.834062576293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077441, + "balance_loss_mlp": 1.06848848, + "diversity_loss_mlp": 0.0, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.08300933032368943, + "language_loss": 0.81837034, + "learning_rate": 0.00018957913649915076, + "loss": 0.82914484, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 3752, + "time_per_iteration": 3.1204826831817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076559, + "balance_loss_mlp": 1.06739748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.08305681898579634, + "language_loss": 0.79633486, + "learning_rate": 0.00018933496756097428, + "loss": 0.80710053, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3753, + "time_per_iteration": 2.6664350032806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077149, + "balance_loss_mlp": 1.06786871, + "diversity_loss_mlp": 0.0, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.08328010196337048, + "language_loss": 0.81679463, + "learning_rate": 0.0001890909192336603, + "loss": 0.82756615, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3754, + "time_per_iteration": 2.994882822036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_mlp": 1.06407857, + "diversity_loss_mlp": 0.0, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.08777822688547723, + "language_loss": 0.70716894, + "learning_rate": 0.00018884699161195623, + "loss": 0.71790028, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3755, + "time_per_iteration": 4.262615442276001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_mlp": 1.06174874, + "diversity_loss_mlp": 0.0, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.0673256778775424, + "language_loss": 0.77517748, + "learning_rate": 0.00018860318479056327, + "loss": 0.78588951, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3756, + "time_per_iteration": 3.1185147762298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064095, + "balance_loss_mlp": 1.05514276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.06734169026400741, + "language_loss": 0.83406973, + "learning_rate": 0.00018835949886413555, + "loss": 0.84471071, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3757, + "time_per_iteration": 2.7693490982055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.05735517, + "diversity_loss_mlp": 0.0, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.0750419048722912, + "language_loss": 0.78459024, + "learning_rate": 0.0001881159339272806, + "loss": 0.79525727, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3758, + "time_per_iteration": 2.6415517330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_mlp": 1.05062032, + "diversity_loss_mlp": 0.0, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.0644798827635335, + "language_loss": 0.78601432, + "learning_rate": 0.00018787249007455858, + "loss": 0.79661226, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3759, + "time_per_iteration": 2.6022799015045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063242, + "balance_loss_mlp": 1.05413401, + "diversity_loss_mlp": 0.0, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.07015599197769962, + "language_loss": 0.71291095, + "learning_rate": 0.00018762916740048302, + "loss": 0.72354335, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3760, + "time_per_iteration": 2.8239991664886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059773, + "balance_loss_mlp": 1.05033171, + "diversity_loss_mlp": 0.0, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.07068719643677601, + "language_loss": 0.86275655, + "learning_rate": 0.0001873859659995195, + "loss": 0.87335426, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3761, + "time_per_iteration": 2.825853109359741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056903, + "balance_loss_mlp": 1.04742599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.06521234046982781, + "language_loss": 0.83369851, + "learning_rate": 0.0001871428859660878, + "loss": 0.84426749, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3762, + "time_per_iteration": 2.765061855316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054846, + "balance_loss_mlp": 1.04584002, + "diversity_loss_mlp": 0.0, + "epoch": 0.7239322816467872, + "flos": 658987176960.0, + "grad_norm": 0.06876344834189922, + "language_loss": 0.81910485, + "learning_rate": 0.00018689992739455975, + "loss": 0.82965332, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3763, + "time_per_iteration": 2.955744504928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_mlp": 1.04123139, + "diversity_loss_mlp": 0.0, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.06967924844938471, + "language_loss": 0.85903621, + "learning_rate": 0.00018665709037926027, + "loss": 0.86954343, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.0947876, + "routerloss_mlp": 0.0, + "step": 3764, + "time_per_iteration": 3.306689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_mlp": 1.04077554, + "diversity_loss_mlp": 0.0, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.07823184864923875, + "language_loss": 0.8509047, + "learning_rate": 0.00018641437501446694, + "loss": 0.86140537, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3765, + "time_per_iteration": 2.5606436729431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053259, + "balance_loss_mlp": 1.04385924, + "diversity_loss_mlp": 0.0, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.07453327039799393, + "language_loss": 0.8240428, + "learning_rate": 0.0001861717813944104, + "loss": 0.83457536, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3766, + "time_per_iteration": 2.639479875564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052028, + "balance_loss_mlp": 1.04260468, + "diversity_loss_mlp": 0.0, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.07462880824505752, + "language_loss": 0.79635704, + "learning_rate": 0.00018592930961327365, + "loss": 0.80687737, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3767, + "time_per_iteration": 2.71537446975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051032, + "balance_loss_mlp": 1.04159653, + "diversity_loss_mlp": 0.0, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.06502387009338012, + "language_loss": 0.88172042, + "learning_rate": 0.00018568695976519273, + "loss": 0.89223075, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3768, + "time_per_iteration": 2.7851336002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053572, + "balance_loss_mlp": 1.04388046, + "diversity_loss_mlp": 0.0, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.07526480217284313, + "language_loss": 0.80197144, + "learning_rate": 0.00018544473194425593, + "loss": 0.81250715, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 3769, + "time_per_iteration": 2.5187532901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054045, + "balance_loss_mlp": 1.044276, + "diversity_loss_mlp": 0.0, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.07238275679239237, + "language_loss": 0.78824592, + "learning_rate": 0.00018520262624450485, + "loss": 0.79878634, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.09759521, + "routerloss_mlp": 0.0, + "step": 3770, + "time_per_iteration": 2.8748114109039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_mlp": 1.04787064, + "diversity_loss_mlp": 0.0, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.08918095477851212, + "language_loss": 0.86894727, + "learning_rate": 0.00018496064275993324, + "loss": 0.87952113, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3771, + "time_per_iteration": 2.824845314025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105481, + "balance_loss_mlp": 1.04509437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.06900224223805673, + "language_loss": 0.82001221, + "learning_rate": 0.00018471878158448686, + "loss": 0.83056033, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.0970459, + "routerloss_mlp": 0.0, + "step": 3772, + "time_per_iteration": 2.9548990726470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056235, + "balance_loss_mlp": 1.04668033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.058256019250052936, + "language_loss": 0.84301949, + "learning_rate": 0.00018447704281206512, + "loss": 0.85358179, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.09545898, + "routerloss_mlp": 0.0, + "step": 3773, + "time_per_iteration": 2.83591365814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055601, + "balance_loss_mlp": 1.04598725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.07576068763334884, + "language_loss": 0.82763028, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818638, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.09613037, + "routerloss_mlp": 0.0, + "step": 3774, + "time_per_iteration": 2.68778657913208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060495, + "balance_loss_mlp": 1.05112517, + "diversity_loss_mlp": 0.0, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.0805275617178238, + "language_loss": 0.80610001, + "learning_rate": 0.0001839939328516526, + "loss": 0.81670493, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3775, + "time_per_iteration": 2.7422258853912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790959, + "balance_loss_mlp": 1.33957541, + "diversity_loss_mlp": 0.21958014, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.033705672182060005, + "language_loss": 0.8138454, + "learning_rate": 0.0001837525618512218, + "loss": 0.82175499, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01138153, + "step": 3776, + "time_per_iteration": 2.9108829498291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_mlp": 1.04409015, + "diversity_loss_mlp": 0.0, + "epoch": 0.7266256252404771, + "flos": 681036968448.0, + "grad_norm": 0.07511121424148261, + "language_loss": 0.8321476, + "learning_rate": 0.00018351131362893519, + "loss": 0.84268057, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 3777, + "time_per_iteration": 2.789809465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058309, + "balance_loss_mlp": 1.04874849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.08246656435114352, + "language_loss": 0.80534494, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592798, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.09558105, + "routerloss_mlp": 0.0, + "step": 3778, + "time_per_iteration": 2.6201207637786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_mlp": 1.0502367, + "diversity_loss_mlp": 0.0, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.060849425034284504, + "language_loss": 0.87504601, + "learning_rate": 0.00018302918589339036, + "loss": 0.88564098, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3779, + "time_per_iteration": 2.689378499984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061153, + "balance_loss_mlp": 1.05198562, + "diversity_loss_mlp": 0.0, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06743911417724738, + "language_loss": 0.90138805, + "learning_rate": 0.00018278830656731054, + "loss": 0.91199952, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 3780, + "time_per_iteration": 2.6595706939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056667, + "balance_loss_mlp": 1.04758894, + "diversity_loss_mlp": 0.0, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.06124301945992682, + "language_loss": 0.86350238, + "learning_rate": 0.00018254755039373222, + "loss": 0.87406909, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3781, + "time_per_iteration": 2.7230565547943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062194, + "balance_loss_mlp": 1.0530144, + "diversity_loss_mlp": 0.0, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.07105415138975459, + "language_loss": 0.83752382, + "learning_rate": 0.0001823069174661252, + "loss": 0.84814572, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 3782, + "time_per_iteration": 2.7941086292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056585, + "balance_loss_mlp": 1.04759097, + "diversity_loss_mlp": 0.0, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.06458866746308467, + "language_loss": 0.78171599, + "learning_rate": 0.00018206640787791112, + "loss": 0.79228187, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3783, + "time_per_iteration": 2.618022918701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062955, + "balance_loss_mlp": 1.05387712, + "diversity_loss_mlp": 0.0, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.06663972838638854, + "language_loss": 0.85480422, + "learning_rate": 0.00018182602172246416, + "loss": 0.86543375, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3784, + "time_per_iteration": 2.6113829612731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.05812776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.07678107880467737, + "language_loss": 0.76375031, + "learning_rate": 0.00018158575909311075, + "loss": 0.77441949, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 3785, + "time_per_iteration": 2.650192975997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_mlp": 1.05243719, + "diversity_loss_mlp": 0.0, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.07604258502871962, + "language_loss": 0.79732937, + "learning_rate": 0.000181345620083129, + "loss": 0.80794436, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3786, + "time_per_iteration": 2.8074841499328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061193, + "balance_loss_mlp": 1.05211556, + "diversity_loss_mlp": 0.0, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.0629164713746694, + "language_loss": 0.86736983, + "learning_rate": 0.00018110560478574927, + "loss": 0.87798178, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3787, + "time_per_iteration": 2.6831634044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106216, + "balance_loss_mlp": 1.05288577, + "diversity_loss_mlp": 0.0, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.07652228362928638, + "language_loss": 0.80521822, + "learning_rate": 0.0001808657132941533, + "loss": 0.81583983, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3788, + "time_per_iteration": 2.7681210041046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063738, + "balance_loss_mlp": 1.05462408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.06755228065084157, + "language_loss": 0.83012414, + "learning_rate": 0.00018062594570147572, + "loss": 0.84076142, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3789, + "time_per_iteration": 2.59897780418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069496, + "balance_loss_mlp": 1.06051326, + "diversity_loss_mlp": 0.0, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.0602370632110868, + "language_loss": 0.84944886, + "learning_rate": 0.00018038630210080243, + "loss": 0.86014384, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 3790, + "time_per_iteration": 2.8492085933685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_mlp": 1.05299687, + "diversity_loss_mlp": 0.0, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.06258751029355039, + "language_loss": 0.85112703, + "learning_rate": 0.0001801467825851712, + "loss": 0.86174691, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3791, + "time_per_iteration": 2.724008321762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063231, + "balance_loss_mlp": 1.05412316, + "diversity_loss_mlp": 0.0, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.06759881980366181, + "language_loss": 0.78407717, + "learning_rate": 0.00017990738724757172, + "loss": 0.79470944, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3792, + "time_per_iteration": 2.8527557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.05635726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.05706424828537789, + "language_loss": 0.82412189, + "learning_rate": 0.00017966811618094598, + "loss": 0.83477581, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3793, + "time_per_iteration": 2.891587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071379, + "balance_loss_mlp": 1.06256318, + "diversity_loss_mlp": 0.0, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.0800044571001495, + "language_loss": 0.84934509, + "learning_rate": 0.00017942896947818664, + "loss": 0.86005884, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 3794, + "time_per_iteration": 2.578213691711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_mlp": 1.02208936, + "diversity_loss_mlp": 0.0, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.018812365315957286, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.7585234, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.05200195, + "routerloss_mlp": 0.0, + "step": 3795, + "time_per_iteration": 4.8731958866119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05696881, + "diversity_loss_mlp": 0.0, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.08247331408198653, + "language_loss": 0.85473979, + "learning_rate": 0.00017895104953559947, + "loss": 0.86539787, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 3796, + "time_per_iteration": 2.6150035858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071519, + "balance_loss_mlp": 1.06257856, + "diversity_loss_mlp": 0.0, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.0876682306683089, + "language_loss": 0.90019357, + "learning_rate": 0.00017871227648131672, + "loss": 0.91090876, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 3797, + "time_per_iteration": 2.5456666946411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790219, + "balance_loss_mlp": 1.33552265, + "diversity_loss_mlp": 0.2213349, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.0295011086457174, + "language_loss": 0.82969385, + "learning_rate": 0.0001784736281619907, + "loss": 0.83759606, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01178985, + "step": 3798, + "time_per_iteration": 2.617690086364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064248, + "balance_loss_mlp": 1.05507529, + "diversity_loss_mlp": 0.0, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.0761333988969544, + "language_loss": 0.74143457, + "learning_rate": 0.00017823510467027232, + "loss": 0.75207704, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 3799, + "time_per_iteration": 2.74944806098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061269, + "balance_loss_mlp": 1.05231094, + "diversity_loss_mlp": 0.0, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.07529945885516458, + "language_loss": 0.7849319, + "learning_rate": 0.00017799670609876516, + "loss": 0.79554456, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3800, + "time_per_iteration": 2.514719247817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106109, + "balance_loss_mlp": 1.05228066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.07202410794231434, + "language_loss": 0.89223945, + "learning_rate": 0.00017775843254002366, + "loss": 0.90285027, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3801, + "time_per_iteration": 2.742403507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059589, + "balance_loss_mlp": 1.05084491, + "diversity_loss_mlp": 0.0, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.060424645606399964, + "language_loss": 0.83728462, + "learning_rate": 0.00017752028408655367, + "loss": 0.84788048, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3802, + "time_per_iteration": 3.0845768451690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.007903, + "balance_loss_mlp": 1.33712423, + "diversity_loss_mlp": 0.22043222, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.03351149815402085, + "language_loss": 0.85395515, + "learning_rate": 0.00017728226083081272, + "loss": 0.86185813, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01152179, + "step": 3803, + "time_per_iteration": 2.625450849533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.05536509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.06980647435682294, + "language_loss": 0.81371546, + "learning_rate": 0.00017704436286520965, + "loss": 0.82435715, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3804, + "time_per_iteration": 2.5445075035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064875, + "balance_loss_mlp": 1.05574334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.0710476755005787, + "language_loss": 0.84313726, + "learning_rate": 0.0001768065902821046, + "loss": 0.85378599, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3805, + "time_per_iteration": 2.6542673110961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060899, + "balance_loss_mlp": 1.05200648, + "diversity_loss_mlp": 0.0, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.07797130890244271, + "language_loss": 0.8206104, + "learning_rate": 0.00017656894317380907, + "loss": 0.83121943, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 3806, + "time_per_iteration": 2.701544761657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020369, + "balance_loss_mlp": 1.01498067, + "diversity_loss_mlp": 0.0, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.021367923460696967, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.7705164, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.05395508, + "routerloss_mlp": 0.0, + "step": 3807, + "time_per_iteration": 5.001535177230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066451, + "balance_loss_mlp": 1.05737972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.08165775614059534, + "language_loss": 0.83709639, + "learning_rate": 0.00017609402575064875, + "loss": 0.84776092, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 3808, + "time_per_iteration": 2.583564043045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061217, + "balance_loss_mlp": 1.05229425, + "diversity_loss_mlp": 0.0, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.0811056502064105, + "language_loss": 0.80930746, + "learning_rate": 0.00017585675562016367, + "loss": 0.81991959, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 3809, + "time_per_iteration": 2.6347053050994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101136, + "balance_loss_mlp": 1.00604343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.015405005389362274, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78224206, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.05322266, + "routerloss_mlp": 0.0, + "step": 3810, + "time_per_iteration": 4.809669017791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_mlp": 1.05418134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.08174261034044085, + "language_loss": 0.85100114, + "learning_rate": 0.00017538259298196474, + "loss": 0.86163306, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 3811, + "time_per_iteration": 2.5669541358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_mlp": 1.05802464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.06518192792765873, + "language_loss": 0.82332867, + "learning_rate": 0.00017514570065833745, + "loss": 0.83399785, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3812, + "time_per_iteration": 2.7447328567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071687, + "balance_loss_mlp": 1.06259131, + "diversity_loss_mlp": 0.0, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.09580264059121266, + "language_loss": 0.80788046, + "learning_rate": 0.00017490893445433426, + "loss": 0.81859732, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3813, + "time_per_iteration": 2.6378085613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.05522716, + "diversity_loss_mlp": 0.0, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.07102449829418327, + "language_loss": 0.81571025, + "learning_rate": 0.00017467229446187587, + "loss": 0.82635403, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3814, + "time_per_iteration": 2.7120914459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072677, + "balance_loss_mlp": 1.06393909, + "diversity_loss_mlp": 0.0, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.07114012207935533, + "language_loss": 0.81285048, + "learning_rate": 0.00017443578077283424, + "loss": 0.82357717, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 3815, + "time_per_iteration": 2.6395435333251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_mlp": 1.05747199, + "diversity_loss_mlp": 0.0, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.07483834875110257, + "language_loss": 0.84961641, + "learning_rate": 0.0001741993934790319, + "loss": 0.86028135, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3816, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059942, + "balance_loss_mlp": 1.05116272, + "diversity_loss_mlp": 0.0, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.07480496039033006, + "language_loss": 0.84648383, + "learning_rate": 0.00017396313267224273, + "loss": 0.85708326, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 3817, + "time_per_iteration": 2.8066418170928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066831, + "balance_loss_mlp": 1.05799198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.0889487029403391, + "language_loss": 0.8847158, + "learning_rate": 0.0001737269984441912, + "loss": 0.89538407, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3818, + "time_per_iteration": 2.6318438053131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060197, + "balance_loss_mlp": 1.05124998, + "diversity_loss_mlp": 0.0, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.07556044268941689, + "language_loss": 0.85168499, + "learning_rate": 0.00017349099088655263, + "loss": 0.86228693, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3819, + "time_per_iteration": 2.6988065242767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058709, + "balance_loss_mlp": 1.05007255, + "diversity_loss_mlp": 0.0, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.06839680418094873, + "language_loss": 0.80908042, + "learning_rate": 0.00017325511009095375, + "loss": 0.81966752, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 3820, + "time_per_iteration": 2.727027177810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_mlp": 1.04837942, + "diversity_loss_mlp": 0.0, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.07744320065165705, + "language_loss": 0.83646286, + "learning_rate": 0.00017301935614897113, + "loss": 0.84703583, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 3821, + "time_per_iteration": 2.6904449462890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059614, + "balance_loss_mlp": 1.05071497, + "diversity_loss_mlp": 0.0, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.06367960554180149, + "language_loss": 0.82050133, + "learning_rate": 0.00017278372915213274, + "loss": 0.83109748, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3822, + "time_per_iteration": 2.715162515640259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009526, + "balance_loss_mlp": 1.00437641, + "diversity_loss_mlp": 0.0, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.013680325571624621, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80903369, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3823, + "time_per_iteration": 4.962257146835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056616, + "balance_loss_mlp": 1.04753208, + "diversity_loss_mlp": 0.0, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.08246165896918017, + "language_loss": 0.80686677, + "learning_rate": 0.00017231285635975314, + "loss": 0.81743288, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3824, + "time_per_iteration": 2.892613172531128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060803, + "balance_loss_mlp": 1.05131412, + "diversity_loss_mlp": 0.0, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.06805025721620432, + "language_loss": 0.83387762, + "learning_rate": 0.00017207761074702115, + "loss": 0.84448564, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3825, + "time_per_iteration": 2.600008964538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061591, + "balance_loss_mlp": 1.05259085, + "diversity_loss_mlp": 0.0, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.06050130894095604, + "language_loss": 0.84002912, + "learning_rate": 0.0001718424924450514, + "loss": 0.85064507, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3826, + "time_per_iteration": 2.5992300510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054783, + "balance_loss_mlp": 1.04562807, + "diversity_loss_mlp": 0.0, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.057066515344493245, + "language_loss": 0.86262274, + "learning_rate": 0.00017160750154512482, + "loss": 0.87317061, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 3827, + "time_per_iteration": 2.726304292678833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795034, + "balance_loss_mlp": 1.34579134, + "diversity_loss_mlp": 0.220893, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 0.03015959834370855, + "language_loss": 0.83901906, + "learning_rate": 0.0001713726381384731, + "loss": 0.84696937, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01169185, + "step": 3828, + "time_per_iteration": 2.8043603897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061645, + "balance_loss_mlp": 1.05248344, + "diversity_loss_mlp": 0.0, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.06844777280948466, + "language_loss": 0.81076348, + "learning_rate": 0.00017113790231627812, + "loss": 0.8213799, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3829, + "time_per_iteration": 2.619093179702759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100728, + "balance_loss_mlp": 1.0020107, + "diversity_loss_mlp": 0.0, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.01400462839453399, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80265498, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.05273438, + "routerloss_mlp": 0.0, + "step": 3830, + "time_per_iteration": 4.812221527099609 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00792371, + "balance_loss_mlp": 1.34191561, + "diversity_loss_mlp": 0.21972378, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.03330075510268521, + "language_loss": 0.81812584, + "learning_rate": 0.00017066881378973936, + "loss": 0.82604957, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01155161, + "step": 3831, + "time_per_iteration": 2.7056965827941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060657, + "balance_loss_mlp": 1.05176377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.07192956817041389, + "language_loss": 0.83134949, + "learning_rate": 0.00017043446126751189, + "loss": 0.84195602, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 3832, + "time_per_iteration": 2.676421880722046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060842, + "balance_loss_mlp": 1.05175185, + "diversity_loss_mlp": 0.0, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.07065913186643534, + "language_loss": 0.76922351, + "learning_rate": 0.00017020023669397376, + "loss": 0.77983195, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3833, + "time_per_iteration": 2.67942214012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063044, + "balance_loss_mlp": 1.0536567, + "diversity_loss_mlp": 0.0, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.07582868630536281, + "language_loss": 0.81676751, + "learning_rate": 0.0001699661401600589, + "loss": 0.82739794, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3834, + "time_per_iteration": 2.5813028812408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791828, + "balance_loss_mlp": 1.34016216, + "diversity_loss_mlp": 0.22067872, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.03104422851251126, + "language_loss": 0.78392982, + "learning_rate": 0.00016973217175665205, + "loss": 0.79184818, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01140742, + "step": 3835, + "time_per_iteration": 2.622943639755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_mlp": 0.99702322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.013207371532760371, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82168412, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.05224609, + "routerloss_mlp": 0.0, + "step": 3836, + "time_per_iteration": 4.931336402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_mlp": 1.05126452, + "diversity_loss_mlp": 0.0, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.06649751574670516, + "language_loss": 0.84498501, + "learning_rate": 0.00016926461970465047, + "loss": 0.85558796, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 3837, + "time_per_iteration": 2.765747547149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_mlp": 1.04992294, + "diversity_loss_mlp": 0.0, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.0574260047104924, + "language_loss": 0.84358233, + "learning_rate": 0.00016903103623757516, + "loss": 0.85417342, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3838, + "time_per_iteration": 3.069658041000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060146, + "balance_loss_mlp": 1.05106258, + "diversity_loss_mlp": 0.0, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.19052913382225448, + "language_loss": 0.80133057, + "learning_rate": 0.00016879758126404738, + "loss": 0.81193197, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3839, + "time_per_iteration": 2.689941167831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789085, + "balance_loss_mlp": 1.33350182, + "diversity_loss_mlp": 0.2223025, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.03551016649676842, + "language_loss": 0.79851139, + "learning_rate": 0.00016856425487470216, + "loss": 0.80640227, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01118332, + "step": 3840, + "time_per_iteration": 3.1254615783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064553, + "balance_loss_mlp": 1.05543303, + "diversity_loss_mlp": 0.0, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.0706997471436485, + "language_loss": 0.79199183, + "learning_rate": 0.00016833105716012486, + "loss": 0.8026374, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 3841, + "time_per_iteration": 3.138193368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063082, + "balance_loss_mlp": 1.05398655, + "diversity_loss_mlp": 0.0, + "epoch": 0.7391304347826086, + "flos": 817026020352.0, + "grad_norm": 0.06630465632536123, + "language_loss": 0.85135829, + "learning_rate": 0.00016809798821085088, + "loss": 0.86198914, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3842, + "time_per_iteration": 3.0023772716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.06117415, + "diversity_loss_mlp": 0.0, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.05652902477854722, + "language_loss": 0.89046443, + "learning_rate": 0.00016786504811736565, + "loss": 0.90116704, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3843, + "time_per_iteration": 2.706385374069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063666, + "balance_loss_mlp": 1.05483222, + "diversity_loss_mlp": 0.0, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.0599118075718357, + "language_loss": 0.82577473, + "learning_rate": 0.00016763223697010442, + "loss": 0.83641136, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3844, + "time_per_iteration": 3.0668578147888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065987, + "balance_loss_mlp": 1.05714738, + "diversity_loss_mlp": 0.0, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.06587022409921209, + "language_loss": 0.84292293, + "learning_rate": 0.00016739955485945256, + "loss": 0.8535828, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 3845, + "time_per_iteration": 2.76232647895813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.05776656, + "diversity_loss_mlp": 0.0, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.07863227392455628, + "language_loss": 0.85949242, + "learning_rate": 0.00016716700187574513, + "loss": 0.87015998, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 3846, + "time_per_iteration": 2.6615161895751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068464, + "balance_loss_mlp": 1.05967295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.0694717633397352, + "language_loss": 0.8384943, + "learning_rate": 0.0001669345781092675, + "loss": 0.84917903, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 3847, + "time_per_iteration": 2.708287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068988, + "balance_loss_mlp": 1.06022048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.08739626570818541, + "language_loss": 0.87128854, + "learning_rate": 0.0001667022836502546, + "loss": 0.88197839, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 3848, + "time_per_iteration": 2.768453598022461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_mlp": 1.06293964, + "diversity_loss_mlp": 0.0, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.07849103844245357, + "language_loss": 0.83004302, + "learning_rate": 0.00016647011858889077, + "loss": 0.84076011, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3849, + "time_per_iteration": 2.553321123123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_mlp": 1.05774295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.0747699795491948, + "language_loss": 0.85671914, + "learning_rate": 0.00016623808301531056, + "loss": 0.86738473, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 3850, + "time_per_iteration": 2.6675972938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072662, + "balance_loss_mlp": 1.06376278, + "diversity_loss_mlp": 0.0, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.08247164679043814, + "language_loss": 0.79259217, + "learning_rate": 0.00016600617701959842, + "loss": 0.8033188, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3851, + "time_per_iteration": 2.7360141277313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_mlp": 1.03028595, + "diversity_loss_mlp": 0.0, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.02428572869696352, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79879034, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.04931641, + "routerloss_mlp": 0.0, + "step": 3852, + "time_per_iteration": 4.992321968078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_mlp": 1.05746007, + "diversity_loss_mlp": 0.0, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.06380286775900439, + "language_loss": 0.81274605, + "learning_rate": 0.00016554275412186315, + "loss": 0.8234092, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 3853, + "time_per_iteration": 2.82212495803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065457, + "balance_loss_mlp": 1.05660534, + "diversity_loss_mlp": 0.0, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.08235676445627264, + "language_loss": 0.80846745, + "learning_rate": 0.0001653112373997568, + "loss": 0.81912202, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 3854, + "time_per_iteration": 2.6886162757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.06417763, + "diversity_loss_mlp": 0.0, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.0787808176004402, + "language_loss": 0.7459085, + "learning_rate": 0.0001650798506153517, + "loss": 0.75663662, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 3855, + "time_per_iteration": 2.699655294418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064176, + "balance_loss_mlp": 1.05534911, + "diversity_loss_mlp": 0.0, + "epoch": 0.7418237783762985, + "flos": 542539980288.0, + "grad_norm": 0.13185112675918914, + "language_loss": 0.84102911, + "learning_rate": 0.00016484859385848023, + "loss": 0.85167086, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3856, + "time_per_iteration": 2.6237292289733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.05749846, + "diversity_loss_mlp": 0.0, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.0735312090287519, + "language_loss": 0.77380371, + "learning_rate": 0.0001646174672189243, + "loss": 0.7844646, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 3857, + "time_per_iteration": 2.662250518798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066781, + "balance_loss_mlp": 1.05808437, + "diversity_loss_mlp": 0.0, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.07158580991852644, + "language_loss": 0.80202585, + "learning_rate": 0.00016438647078641488, + "loss": 0.81269372, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 3858, + "time_per_iteration": 2.5815234184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061243, + "balance_loss_mlp": 1.05223656, + "diversity_loss_mlp": 0.0, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.07922307514532904, + "language_loss": 0.82879561, + "learning_rate": 0.00016415560465063344, + "loss": 0.83940804, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 3859, + "time_per_iteration": 2.708585739135742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_mlp": 1.04814172, + "diversity_loss_mlp": 0.0, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.07844823875052143, + "language_loss": 0.79364371, + "learning_rate": 0.0001639248689012095, + "loss": 0.80421484, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3860, + "time_per_iteration": 2.58583927154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.05484664, + "diversity_loss_mlp": 0.0, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.0625994675611715, + "language_loss": 0.87600327, + "learning_rate": 0.00016369426362772271, + "loss": 0.88664174, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 3861, + "time_per_iteration": 2.7810909748077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058337, + "balance_loss_mlp": 1.04926515, + "diversity_loss_mlp": 0.0, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.06941058470153043, + "language_loss": 0.80742699, + "learning_rate": 0.00016346378891970233, + "loss": 0.81801033, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3862, + "time_per_iteration": 2.846928596496582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063331, + "balance_loss_mlp": 1.05435514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.0684493510726064, + "language_loss": 0.81710279, + "learning_rate": 0.00016323344486662633, + "loss": 0.82773608, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 3863, + "time_per_iteration": 3.331202745437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259883, + "diversity_loss_mlp": 0.0, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.05806816249285044, + "language_loss": 0.78816247, + "learning_rate": 0.00016300323155792247, + "loss": 0.79877937, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3864, + "time_per_iteration": 2.872833490371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060231, + "balance_loss_mlp": 1.05139732, + "diversity_loss_mlp": 0.0, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.06583078508607046, + "language_loss": 0.88677347, + "learning_rate": 0.00016277314908296687, + "loss": 0.89737576, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 3865, + "time_per_iteration": 2.6268508434295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062318, + "balance_loss_mlp": 1.05286467, + "diversity_loss_mlp": 0.0, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.08180248385301583, + "language_loss": 0.7621361, + "learning_rate": 0.00016254319753108604, + "loss": 0.77275932, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 3866, + "time_per_iteration": 2.8856914043426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062277, + "balance_loss_mlp": 1.05305004, + "diversity_loss_mlp": 0.0, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.07310249763973194, + "language_loss": 0.77018058, + "learning_rate": 0.00016231337699155492, + "loss": 0.78080332, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3867, + "time_per_iteration": 2.975250244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059189, + "balance_loss_mlp": 1.04974771, + "diversity_loss_mlp": 0.0, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.07083990267041149, + "language_loss": 0.78228271, + "learning_rate": 0.0001620836875535977, + "loss": 0.79287452, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.09436035, + "routerloss_mlp": 0.0, + "step": 3868, + "time_per_iteration": 2.856765031814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_mlp": 1.04925001, + "diversity_loss_mlp": 0.0, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.058820941096758894, + "language_loss": 0.80752689, + "learning_rate": 0.00016185412930638766, + "loss": 0.81811309, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3869, + "time_per_iteration": 2.7962300777435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060986, + "balance_loss_mlp": 1.05180645, + "diversity_loss_mlp": 0.0, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.09216022180459393, + "language_loss": 0.82565176, + "learning_rate": 0.00016162470233904765, + "loss": 0.83626163, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3870, + "time_per_iteration": 2.727376937866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059095, + "balance_loss_mlp": 1.05008888, + "diversity_loss_mlp": 0.0, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.08871714462123159, + "language_loss": 0.82108277, + "learning_rate": 0.00016139540674064856, + "loss": 0.83167374, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 3871, + "time_per_iteration": 2.747559070587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055678, + "balance_loss_mlp": 1.04671371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.063692065795828, + "language_loss": 0.7763024, + "learning_rate": 0.00016116624260021113, + "loss": 0.78685915, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3872, + "time_per_iteration": 2.75909423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106192, + "balance_loss_mlp": 1.0528599, + "diversity_loss_mlp": 0.0, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.06099997691226976, + "language_loss": 0.83786505, + "learning_rate": 0.0001609372100067046, + "loss": 0.84848428, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3873, + "time_per_iteration": 2.5251874923706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796431, + "balance_loss_mlp": 1.34714556, + "diversity_loss_mlp": 0.22299039, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.03925838692514683, + "language_loss": 0.85007972, + "learning_rate": 0.0001607083090490475, + "loss": 0.85804403, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01136341, + "step": 3874, + "time_per_iteration": 2.8896329402923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061928, + "balance_loss_mlp": 1.0527246, + "diversity_loss_mlp": 0.0, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.07963892031444339, + "language_loss": 0.80322075, + "learning_rate": 0.00016047953981610714, + "loss": 0.81384003, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 3875, + "time_per_iteration": 2.7198143005371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_mlp": 1.02416849, + "diversity_loss_mlp": 0.0, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.01953041960218584, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80758721, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3876, + "time_per_iteration": 5.047106981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105976, + "balance_loss_mlp": 1.05069435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.07139005535531126, + "language_loss": 0.80606306, + "learning_rate": 0.0001600223968795889, + "loss": 0.81666064, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3877, + "time_per_iteration": 2.8899221420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_mlp": 1.02230287, + "diversity_loss_mlp": 0.0, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.018847716252117216, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76723289, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.04907227, + "routerloss_mlp": 0.0, + "step": 3878, + "time_per_iteration": 4.949044466018677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.05449665, + "diversity_loss_mlp": 0.0, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.08037956070996295, + "language_loss": 0.8220886, + "learning_rate": 0.00015956578190706483, + "loss": 0.83272392, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3879, + "time_per_iteration": 2.679077386856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058904, + "balance_loss_mlp": 1.04966509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.07423526276361143, + "language_loss": 0.75933188, + "learning_rate": 0.00015933767262892468, + "loss": 0.76992095, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3880, + "time_per_iteration": 2.725120782852173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061626, + "balance_loss_mlp": 1.05248249, + "diversity_loss_mlp": 0.0, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.08122487442608403, + "language_loss": 0.81791377, + "learning_rate": 0.00015910969560762927, + "loss": 0.82853001, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3881, + "time_per_iteration": 2.5659735202789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_mlp": 1.05212796, + "diversity_loss_mlp": 0.0, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.06269003532148706, + "language_loss": 0.83085567, + "learning_rate": 0.00015888185093168727, + "loss": 0.84146595, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 3882, + "time_per_iteration": 2.7333316802978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_mlp": 1.0554266, + "diversity_loss_mlp": 0.0, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.06569405974283654, + "language_loss": 0.81109202, + "learning_rate": 0.00015865413868955581, + "loss": 0.82174122, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3883, + "time_per_iteration": 2.6078059673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058237, + "balance_loss_mlp": 1.04946291, + "diversity_loss_mlp": 0.0, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.057634664266444945, + "language_loss": 0.82803142, + "learning_rate": 0.00015842655896964054, + "loss": 0.83861375, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 3884, + "time_per_iteration": 3.042433977127075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061863, + "balance_loss_mlp": 1.0528096, + "diversity_loss_mlp": 0.0, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.07244796431130596, + "language_loss": 0.73654252, + "learning_rate": 0.00015819911186029567, + "loss": 0.74716115, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 3885, + "time_per_iteration": 2.8399569988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063739, + "balance_loss_mlp": 1.05458951, + "diversity_loss_mlp": 0.0, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.0730187367037383, + "language_loss": 0.86386681, + "learning_rate": 0.00015797179744982443, + "loss": 0.87450415, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 3886, + "time_per_iteration": 2.6979753971099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068538, + "balance_loss_mlp": 1.05947804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.06196383449999257, + "language_loss": 0.78900141, + "learning_rate": 0.00015774461582647765, + "loss": 0.79968679, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 3887, + "time_per_iteration": 2.6235530376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_mlp": 1.05791271, + "diversity_loss_mlp": 0.0, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07428746170121639, + "language_loss": 0.81271255, + "learning_rate": 0.00015751756707845505, + "loss": 0.82338268, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3888, + "time_per_iteration": 2.654217481613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066279, + "balance_loss_mlp": 1.05733204, + "diversity_loss_mlp": 0.0, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.06349901375293318, + "language_loss": 0.8820529, + "learning_rate": 0.00015729065129390502, + "loss": 0.89271569, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 3889, + "time_per_iteration": 2.990723133087158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107017, + "balance_loss_mlp": 1.06125295, + "diversity_loss_mlp": 0.0, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.10644115001559669, + "language_loss": 0.82281494, + "learning_rate": 0.0001570638685609241, + "loss": 0.83351666, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3890, + "time_per_iteration": 2.562049627304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064882, + "balance_loss_mlp": 1.0558815, + "diversity_loss_mlp": 0.0, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.07005408827456952, + "language_loss": 0.80632579, + "learning_rate": 0.00015683721896755693, + "loss": 0.81697452, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3891, + "time_per_iteration": 2.5688047409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018069, + "balance_loss_mlp": 1.01291943, + "diversity_loss_mlp": 0.0, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.021126139986013294, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83228564, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.05151367, + "routerloss_mlp": 0.0, + "step": 3892, + "time_per_iteration": 4.9241249561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063391, + "balance_loss_mlp": 1.05425954, + "diversity_loss_mlp": 0.0, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 0.07047459901443781, + "language_loss": 0.85042292, + "learning_rate": 0.00015638431955158528, + "loss": 0.8610568, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 3893, + "time_per_iteration": 2.696835517883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059789, + "balance_loss_mlp": 1.05092609, + "diversity_loss_mlp": 0.0, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.07429691825865621, + "language_loss": 0.81044436, + "learning_rate": 0.00015615806990481186, + "loss": 0.8210423, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 3894, + "time_per_iteration": 2.721975088119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061695, + "balance_loss_mlp": 1.05259371, + "diversity_loss_mlp": 0.0, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.05332768573038703, + "language_loss": 0.84447378, + "learning_rate": 0.00015593195374931452, + "loss": 0.85509074, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3895, + "time_per_iteration": 2.724210500717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057311, + "balance_loss_mlp": 1.04820967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.08170178598725314, + "language_loss": 0.79939067, + "learning_rate": 0.00015570597117287922, + "loss": 0.80996376, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3896, + "time_per_iteration": 2.6550590991973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058835, + "balance_loss_mlp": 1.04970384, + "diversity_loss_mlp": 0.0, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.07111999470543245, + "language_loss": 0.77950025, + "learning_rate": 0.0001554801222632406, + "loss": 0.79008865, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 3897, + "time_per_iteration": 2.5913069248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058781, + "balance_loss_mlp": 1.04961967, + "diversity_loss_mlp": 0.0, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.07004004520272819, + "language_loss": 0.8521589, + "learning_rate": 0.00015525440710808052, + "loss": 0.86274672, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3898, + "time_per_iteration": 2.633772850036621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105768, + "balance_loss_mlp": 1.04835165, + "diversity_loss_mlp": 0.0, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.07310706246925956, + "language_loss": 0.77907795, + "learning_rate": 0.00015502882579502953, + "loss": 0.78965473, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.09332275, + "routerloss_mlp": 0.0, + "step": 3899, + "time_per_iteration": 2.938547372817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_mlp": 1.04551327, + "diversity_loss_mlp": 0.0, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.06650950979385485, + "language_loss": 0.8470974, + "learning_rate": 0.00015480337841166592, + "loss": 0.85764492, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3900, + "time_per_iteration": 2.719611167907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_mlp": 1.05532193, + "diversity_loss_mlp": 0.0, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.06798274648693917, + "language_loss": 0.83017278, + "learning_rate": 0.00015457806504551647, + "loss": 0.84081692, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 3901, + "time_per_iteration": 2.815099000930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055292, + "balance_loss_mlp": 1.04617858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.06551967362841071, + "language_loss": 0.78146368, + "learning_rate": 0.0001543528857840554, + "loss": 0.79201663, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3902, + "time_per_iteration": 2.660747528076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105756, + "balance_loss_mlp": 1.04829192, + "diversity_loss_mlp": 0.0, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.08761977110880032, + "language_loss": 0.80069476, + "learning_rate": 0.000154127840714705, + "loss": 0.81127042, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3903, + "time_per_iteration": 2.791895627975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_mlp": 1.04786348, + "diversity_loss_mlp": 0.0, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.08489214172044417, + "language_loss": 0.82145894, + "learning_rate": 0.00015390292992483557, + "loss": 0.83203179, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 3904, + "time_per_iteration": 2.531291961669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058521, + "balance_loss_mlp": 1.04955626, + "diversity_loss_mlp": 0.0, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.06641081846092535, + "language_loss": 0.84235787, + "learning_rate": 0.00015367815350176523, + "loss": 0.85294312, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 3905, + "time_per_iteration": 2.7290806770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_mlp": 1.04627776, + "diversity_loss_mlp": 0.0, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.06804815402684934, + "language_loss": 0.82392836, + "learning_rate": 0.00015345351153275987, + "loss": 0.8344835, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3906, + "time_per_iteration": 2.530323028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054875, + "balance_loss_mlp": 1.04556477, + "diversity_loss_mlp": 0.0, + "epoch": 0.7516352443247403, + "flos": 641039215104.0, + "grad_norm": 0.06371304983723255, + "language_loss": 0.80832905, + "learning_rate": 0.00015322900410503332, + "loss": 0.81887782, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 3907, + "time_per_iteration": 2.840207576751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062824, + "balance_loss_mlp": 1.05359089, + "diversity_loss_mlp": 0.0, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.0661364017188776, + "language_loss": 0.77996182, + "learning_rate": 0.00015300463130574703, + "loss": 0.79059005, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 3908, + "time_per_iteration": 2.8597986698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00795371, + "balance_loss_mlp": 1.3454839, + "diversity_loss_mlp": 0.22311893, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.027335085290279493, + "language_loss": 0.81861627, + "learning_rate": 0.00015278039322201033, + "loss": 0.82656997, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01107004, + "step": 3909, + "time_per_iteration": 2.991687774658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056027, + "balance_loss_mlp": 1.04691339, + "diversity_loss_mlp": 0.0, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.07802530294793614, + "language_loss": 0.79405951, + "learning_rate": 0.00015255628994088004, + "loss": 0.80461979, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3910, + "time_per_iteration": 2.552389621734619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057943, + "balance_loss_mlp": 1.04875183, + "diversity_loss_mlp": 0.0, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.06839079088853381, + "language_loss": 0.75070244, + "learning_rate": 0.00015233232154936082, + "loss": 0.76128185, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3911, + "time_per_iteration": 3.2685062885284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060306, + "balance_loss_mlp": 1.05104983, + "diversity_loss_mlp": 0.0, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.0742904302268966, + "language_loss": 0.76248109, + "learning_rate": 0.0001521084881344048, + "loss": 0.77308416, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3912, + "time_per_iteration": 2.8669307231903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063744, + "balance_loss_mlp": 1.05449915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.07365945451583152, + "language_loss": 0.86536098, + "learning_rate": 0.00015188478978291208, + "loss": 0.87599838, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3913, + "time_per_iteration": 2.8062844276428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060792, + "balance_loss_mlp": 1.05141592, + "diversity_loss_mlp": 0.0, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.06964875853647617, + "language_loss": 0.86198735, + "learning_rate": 0.00015166122658173014, + "loss": 0.87259525, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 3914, + "time_per_iteration": 2.832261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062251, + "balance_loss_mlp": 1.05276752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.07069372780846282, + "language_loss": 0.88695043, + "learning_rate": 0.00015143779861765332, + "loss": 0.89757293, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 3915, + "time_per_iteration": 2.876596689224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057505, + "balance_loss_mlp": 1.04845726, + "diversity_loss_mlp": 0.0, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.07477721009048348, + "language_loss": 0.81360573, + "learning_rate": 0.00015121450597742458, + "loss": 0.82418078, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 3916, + "time_per_iteration": 2.83457612991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105545, + "balance_loss_mlp": 1.04619908, + "diversity_loss_mlp": 0.0, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07347506206734646, + "language_loss": 0.78634655, + "learning_rate": 0.00015099134874773369, + "loss": 0.79690111, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3917, + "time_per_iteration": 2.7597367763519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793692, + "balance_loss_mlp": 1.34194863, + "diversity_loss_mlp": 0.22241086, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.028776380158614775, + "language_loss": 0.80358481, + "learning_rate": 0.00015076832701521793, + "loss": 0.81152171, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01151239, + "step": 3918, + "time_per_iteration": 2.746518135070801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_mlp": 1.04122829, + "diversity_loss_mlp": 0.0, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.08224807804324459, + "language_loss": 0.82372093, + "learning_rate": 0.000150545440866462, + "loss": 0.83422583, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 3919, + "time_per_iteration": 2.986933708190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056343, + "balance_loss_mlp": 1.047104, + "diversity_loss_mlp": 0.0, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.07659379290436485, + "language_loss": 0.78524017, + "learning_rate": 0.000150322690387998, + "loss": 0.79580355, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 3920, + "time_per_iteration": 2.5535264015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053379, + "balance_loss_mlp": 1.04395509, + "diversity_loss_mlp": 0.0, + "epoch": 0.7543285879184302, + "flos": 565274018304.0, + "grad_norm": 0.08088787979004233, + "language_loss": 0.75178206, + "learning_rate": 0.00015010007566630535, + "loss": 0.76231587, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3921, + "time_per_iteration": 2.752476930618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052876, + "balance_loss_mlp": 1.0435003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.09066204118342673, + "language_loss": 0.81410325, + "learning_rate": 0.00014987759678781077, + "loss": 0.82463199, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 3922, + "time_per_iteration": 2.6611218452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_mlp": 1.04057336, + "diversity_loss_mlp": 0.0, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07014269793522399, + "language_loss": 0.82503462, + "learning_rate": 0.00014965525383888795, + "loss": 0.83553147, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3923, + "time_per_iteration": 2.7689740657806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_mlp": 1.04243279, + "diversity_loss_mlp": 0.0, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.07037901848858046, + "language_loss": 0.72344971, + "learning_rate": 0.00014943304690585851, + "loss": 0.73396569, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 3924, + "time_per_iteration": 2.926786184310913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050623, + "balance_loss_mlp": 1.04116416, + "diversity_loss_mlp": 0.0, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.07074790487011906, + "language_loss": 0.79134214, + "learning_rate": 0.0001492109760749908, + "loss": 0.80184835, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 3925, + "time_per_iteration": 2.6663551330566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048636, + "balance_loss_mlp": 1.03920674, + "diversity_loss_mlp": 0.0, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.06259359506310941, + "language_loss": 0.79865938, + "learning_rate": 0.00014898904143250002, + "loss": 0.80914569, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.09417725, + "routerloss_mlp": 0.0, + "step": 3926, + "time_per_iteration": 2.7111570835113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007032, + "balance_loss_mlp": 1.00193024, + "diversity_loss_mlp": 0.0, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.018464770707338953, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76762235, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.05102539, + "routerloss_mlp": 0.0, + "step": 3927, + "time_per_iteration": 4.9247355461120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049551, + "balance_loss_mlp": 1.04027081, + "diversity_loss_mlp": 0.0, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.0681788266526358, + "language_loss": 0.80484271, + "learning_rate": 0.0001485455810572474, + "loss": 0.81533813, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 3928, + "time_per_iteration": 2.644436836242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050291, + "balance_loss_mlp": 1.04075408, + "diversity_loss_mlp": 0.0, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.05891834719109388, + "language_loss": 0.83858299, + "learning_rate": 0.00014832405549665236, + "loss": 0.84908581, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.09533691, + "routerloss_mlp": 0.0, + "step": 3929, + "time_per_iteration": 2.7012484073638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_mlp": 1.03651154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.06702269562440989, + "language_loss": 0.78850049, + "learning_rate": 0.00014810266646876746, + "loss": 0.79895926, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 3930, + "time_per_iteration": 2.768267869949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_mlp": 1.0400542, + "diversity_loss_mlp": 0.0, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.07203252309013448, + "language_loss": 0.77448905, + "learning_rate": 0.00014788141405954364, + "loss": 0.78498399, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3931, + "time_per_iteration": 2.9904940128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_mlp": 1.03817058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.07800689348595595, + "language_loss": 0.8509475, + "learning_rate": 0.00014766029835487865, + "loss": 0.86142522, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3932, + "time_per_iteration": 2.712207078933716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050974, + "balance_loss_mlp": 1.04148519, + "diversity_loss_mlp": 0.0, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.09178447768332373, + "language_loss": 0.79506183, + "learning_rate": 0.0001474393194406173, + "loss": 0.80557162, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3933, + "time_per_iteration": 2.933224678039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048897, + "balance_loss_mlp": 1.03937268, + "diversity_loss_mlp": 0.0, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.05892607400759823, + "language_loss": 0.79702771, + "learning_rate": 0.00014721847740255112, + "loss": 0.80751669, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 3934, + "time_per_iteration": 2.826552391052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003728, + "balance_loss_mlp": 0.99864936, + "diversity_loss_mlp": 0.0, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.02131829704568505, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74915653, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.05078125, + "routerloss_mlp": 0.0, + "step": 3935, + "time_per_iteration": 4.626272439956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050547, + "balance_loss_mlp": 1.0411061, + "diversity_loss_mlp": 0.0, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.08283198519893772, + "language_loss": 0.78541541, + "learning_rate": 0.00014677720429790526, + "loss": 0.79592091, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.09429932, + "routerloss_mlp": 0.0, + "step": 3936, + "time_per_iteration": 2.634308338165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046754, + "balance_loss_mlp": 1.03724098, + "diversity_loss_mlp": 0.0, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.060589870954327815, + "language_loss": 0.84442061, + "learning_rate": 0.0001465567734026429, + "loss": 0.8548882, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.09503174, + "routerloss_mlp": 0.0, + "step": 3937, + "time_per_iteration": 2.716531276702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051246, + "balance_loss_mlp": 1.04150677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08803792614427135, + "language_loss": 0.82826757, + "learning_rate": 0.00014633647972621034, + "loss": 0.83878005, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.09729004, + "routerloss_mlp": 0.0, + "step": 3938, + "time_per_iteration": 2.4589834213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053327, + "balance_loss_mlp": 1.04381418, + "diversity_loss_mlp": 0.0, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.07008474871833649, + "language_loss": 0.86420083, + "learning_rate": 0.00014611632335413354, + "loss": 0.87473404, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3939, + "time_per_iteration": 2.7953155040740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_mlp": 1.04597211, + "diversity_loss_mlp": 0.0, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.06005420836927303, + "language_loss": 0.82715803, + "learning_rate": 0.00014589630437188456, + "loss": 0.83771348, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.09570312, + "routerloss_mlp": 0.0, + "step": 3940, + "time_per_iteration": 3.1720919609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056474, + "balance_loss_mlp": 1.04727697, + "diversity_loss_mlp": 0.0, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.07556117037580423, + "language_loss": 0.78885162, + "learning_rate": 0.00014567642286488253, + "loss": 0.7994163, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3941, + "time_per_iteration": 2.5224215984344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105497, + "balance_loss_mlp": 1.0453198, + "diversity_loss_mlp": 0.0, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.10380533878684198, + "language_loss": 0.79189527, + "learning_rate": 0.00014545667891849258, + "loss": 0.80244499, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.09649658, + "routerloss_mlp": 0.0, + "step": 3942, + "time_per_iteration": 2.6196579933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_mlp": 1.04717493, + "diversity_loss_mlp": 0.0, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.06980232416240703, + "language_loss": 0.82745945, + "learning_rate": 0.00014523707261802733, + "loss": 0.83802581, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 3943, + "time_per_iteration": 2.652625799179077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794094, + "balance_loss_mlp": 1.34365344, + "diversity_loss_mlp": 0.22232203, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.034795977662747106, + "language_loss": 0.81799769, + "learning_rate": 0.00014501760404874527, + "loss": 0.82593858, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01110633, + "step": 3944, + "time_per_iteration": 2.7529001235961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059832, + "balance_loss_mlp": 1.05071235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.07566953086997541, + "language_loss": 0.85807776, + "learning_rate": 0.00014479827329585176, + "loss": 0.86867607, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 3945, + "time_per_iteration": 2.701622486114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_mlp": 1.04233766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.05933089648069645, + "language_loss": 0.84881538, + "learning_rate": 0.00014457908044449846, + "loss": 0.85933375, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 3946, + "time_per_iteration": 2.728095769882202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787034, + "balance_loss_mlp": 1.32538223, + "diversity_loss_mlp": 0.22601989, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.02987157443530754, + "language_loss": 0.83105904, + "learning_rate": 0.00014436002557978371, + "loss": 0.83892936, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.011333, + "step": 3947, + "time_per_iteration": 2.8229527473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009615, + "balance_loss_mlp": 1.00491834, + "diversity_loss_mlp": 0.0, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.009520189474687826, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77652764, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.046875, + "routerloss_mlp": 0.0, + "step": 3948, + "time_per_iteration": 6.289541482925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060096, + "balance_loss_mlp": 1.05072582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.06379991139513626, + "language_loss": 0.79987645, + "learning_rate": 0.0001439223301503945, + "loss": 0.8104775, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.09362793, + "routerloss_mlp": 0.0, + "step": 3949, + "time_per_iteration": 2.4896605014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063286, + "balance_loss_mlp": 1.05441725, + "diversity_loss_mlp": 0.0, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.07443357695534152, + "language_loss": 0.75937033, + "learning_rate": 0.00014370368975564834, + "loss": 0.7700032, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 3950, + "time_per_iteration": 2.939652442932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062595, + "balance_loss_mlp": 1.05339789, + "diversity_loss_mlp": 0.0, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.07225326310483449, + "language_loss": 0.83501256, + "learning_rate": 0.00014348518768739766, + "loss": 0.84563851, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 3951, + "time_per_iteration": 2.760315179824829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013895, + "balance_loss_mlp": 1.00924563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.01015881799745275, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77741933, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 3952, + "time_per_iteration": 4.8084025382995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106276, + "balance_loss_mlp": 1.05365205, + "diversity_loss_mlp": 0.0, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.06460876756714844, + "language_loss": 0.86549526, + "learning_rate": 0.00014304859886964867, + "loss": 0.87612283, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3953, + "time_per_iteration": 2.9919626712799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_mlp": 1.05655789, + "diversity_loss_mlp": 0.0, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.06531272999026969, + "language_loss": 0.83625901, + "learning_rate": 0.00014283051228964878, + "loss": 0.84691703, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 3954, + "time_per_iteration": 2.7195558547973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060232, + "balance_loss_mlp": 1.05114245, + "diversity_loss_mlp": 0.0, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.06973579873696066, + "language_loss": 0.82862848, + "learning_rate": 0.00014261256437514197, + "loss": 0.83923078, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 3955, + "time_per_iteration": 2.6542091369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794195, + "balance_loss_mlp": 1.3411088, + "diversity_loss_mlp": 0.22477356, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.03401627820018092, + "language_loss": 0.82645166, + "learning_rate": 0.0001423947552107428, + "loss": 0.83439362, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0112533, + "step": 3956, + "time_per_iteration": 2.7648067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_mlp": 1.05335546, + "diversity_loss_mlp": 0.0, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.06632119476384091, + "language_loss": 0.77184016, + "learning_rate": 0.00014217708488101243, + "loss": 0.78246629, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3957, + "time_per_iteration": 3.1002120971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064244, + "balance_loss_mlp": 1.05514848, + "diversity_loss_mlp": 0.0, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08639703813163502, + "language_loss": 0.77281177, + "learning_rate": 0.0001419595534704579, + "loss": 0.78345418, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 3958, + "time_per_iteration": 2.7124218940734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062721, + "balance_loss_mlp": 1.05369043, + "diversity_loss_mlp": 0.0, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.06838082339011158, + "language_loss": 0.81229275, + "learning_rate": 0.00014174216106353237, + "loss": 0.82291996, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 3959, + "time_per_iteration": 2.628516912460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060156, + "balance_loss_mlp": 1.05085802, + "diversity_loss_mlp": 0.0, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.07205328766008003, + "language_loss": 0.76858711, + "learning_rate": 0.00014152490774463512, + "loss": 0.77918863, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 3960, + "time_per_iteration": 2.630159854888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106295, + "balance_loss_mlp": 1.05382431, + "diversity_loss_mlp": 0.0, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.0819861529910791, + "language_loss": 0.87198371, + "learning_rate": 0.00014130779359811135, + "loss": 0.88261318, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3961, + "time_per_iteration": 2.464413642883301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058979, + "balance_loss_mlp": 1.04990077, + "diversity_loss_mlp": 0.0, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.07245892571162069, + "language_loss": 0.85946453, + "learning_rate": 0.0001410908187082521, + "loss": 0.87005424, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 3962, + "time_per_iteration": 2.921780586242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058262, + "balance_loss_mlp": 1.04887986, + "diversity_loss_mlp": 0.0, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.06688462156779182, + "language_loss": 0.83390021, + "learning_rate": 0.0001408739831592949, + "loss": 0.84448284, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3963, + "time_per_iteration": 2.6833889484405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060293, + "balance_loss_mlp": 1.05104804, + "diversity_loss_mlp": 0.0, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.0755930480675871, + "language_loss": 0.77544367, + "learning_rate": 0.0001406572870354224, + "loss": 0.7860465, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3964, + "time_per_iteration": 2.7871947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060093, + "balance_loss_mlp": 1.05084801, + "diversity_loss_mlp": 0.0, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.06988595261199848, + "language_loss": 0.86813599, + "learning_rate": 0.00014044073042076337, + "loss": 0.87873685, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 3965, + "time_per_iteration": 2.4948155879974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_mlp": 1.0558666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.053016831320737375, + "language_loss": 0.88845956, + "learning_rate": 0.00014022431339939302, + "loss": 0.8991074, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 3966, + "time_per_iteration": 2.673383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057572, + "balance_loss_mlp": 1.04824972, + "diversity_loss_mlp": 0.0, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.09057872820095057, + "language_loss": 0.7816959, + "learning_rate": 0.00014000803605533163, + "loss": 0.79227161, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3967, + "time_per_iteration": 2.8631951808929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057314, + "balance_loss_mlp": 1.04857016, + "diversity_loss_mlp": 0.0, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.08630668575925342, + "language_loss": 0.84042531, + "learning_rate": 0.00013979189847254553, + "loss": 0.85099846, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 3968, + "time_per_iteration": 2.5586295127868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057911, + "balance_loss_mlp": 1.04832053, + "diversity_loss_mlp": 0.0, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.07119073500769035, + "language_loss": 0.80335605, + "learning_rate": 0.00013957590073494674, + "loss": 0.81393516, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.09576416, + "routerloss_mlp": 0.0, + "step": 3969, + "time_per_iteration": 2.785759449005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055351, + "balance_loss_mlp": 1.0460887, + "diversity_loss_mlp": 0.0, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.0691753234001315, + "language_loss": 0.78865349, + "learning_rate": 0.0001393600429263931, + "loss": 0.79920697, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 3970, + "time_per_iteration": 2.7582993507385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013524, + "balance_loss_mlp": 1.00873148, + "diversity_loss_mlp": 0.0, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.011908325756944461, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.7575841, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.04785156, + "routerloss_mlp": 0.0, + "step": 3971, + "time_per_iteration": 4.944155693054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051128, + "balance_loss_mlp": 1.04182386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 0.07417078530438988, + "language_loss": 0.81570405, + "learning_rate": 0.0001389287474315804, + "loss": 0.82621539, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 3972, + "time_per_iteration": 2.6553244590759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052519, + "balance_loss_mlp": 1.04347086, + "diversity_loss_mlp": 0.0, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.05487535888911553, + "language_loss": 0.79840803, + "learning_rate": 0.00013871330991276505, + "loss": 0.8089332, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 3973, + "time_per_iteration": 2.681697368621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_mlp": 1.0428077, + "diversity_loss_mlp": 0.0, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.08960984364762024, + "language_loss": 0.80946076, + "learning_rate": 0.00013849801265788247, + "loss": 0.81998283, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3974, + "time_per_iteration": 3.0523104667663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796632, + "balance_loss_mlp": 1.34598541, + "diversity_loss_mlp": 0.22497699, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.033347453631336434, + "language_loss": 0.83125114, + "learning_rate": 0.00013828285575051818, + "loss": 0.83921754, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01115073, + "step": 3975, + "time_per_iteration": 2.631014108657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_mlp": 1.04301977, + "diversity_loss_mlp": 0.0, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.06872239671854397, + "language_loss": 0.84060633, + "learning_rate": 0.0001380678392742035, + "loss": 0.85112655, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 3976, + "time_per_iteration": 2.710768938064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050013, + "balance_loss_mlp": 1.04042244, + "diversity_loss_mlp": 0.0, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.05722299510673748, + "language_loss": 0.84721446, + "learning_rate": 0.00013785296331241526, + "loss": 0.85771459, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.09588623, + "routerloss_mlp": 0.0, + "step": 3977, + "time_per_iteration": 2.863175868988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049924, + "balance_loss_mlp": 1.04060829, + "diversity_loss_mlp": 0.0, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.0690026214963165, + "language_loss": 0.87410915, + "learning_rate": 0.00013763822794857583, + "loss": 0.88460839, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 3978, + "time_per_iteration": 3.3100810050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049847, + "balance_loss_mlp": 1.04050136, + "diversity_loss_mlp": 0.0, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.06632607852839086, + "language_loss": 0.90003061, + "learning_rate": 0.00013742363326605278, + "loss": 0.91052908, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 3979, + "time_per_iteration": 2.754115581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_mlp": 1.04258752, + "diversity_loss_mlp": 0.0, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.059791344398012564, + "language_loss": 0.78432417, + "learning_rate": 0.00013720917934815935, + "loss": 0.79484463, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.09466553, + "routerloss_mlp": 0.0, + "step": 3980, + "time_per_iteration": 2.801797866821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_mlp": 1.04425907, + "diversity_loss_mlp": 0.0, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.08312893208703641, + "language_loss": 0.82967758, + "learning_rate": 0.00013699486627815344, + "loss": 0.84021544, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.09509277, + "routerloss_mlp": 0.0, + "step": 3981, + "time_per_iteration": 2.6589224338531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052365, + "balance_loss_mlp": 1.04295897, + "diversity_loss_mlp": 0.0, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.07260212580199023, + "language_loss": 0.82633436, + "learning_rate": 0.00013678069413923928, + "loss": 0.83685803, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 3982, + "time_per_iteration": 2.6876726150512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_mlp": 1.0454247, + "diversity_loss_mlp": 0.0, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.060912508562222696, + "language_loss": 0.81971568, + "learning_rate": 0.00013656666301456555, + "loss": 0.83026105, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 3983, + "time_per_iteration": 2.547969341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051422, + "balance_loss_mlp": 1.04195666, + "diversity_loss_mlp": 0.0, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.07203556219041155, + "language_loss": 0.84272242, + "learning_rate": 0.0001363527729872267, + "loss": 0.85323668, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 3984, + "time_per_iteration": 2.638418197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052921, + "balance_loss_mlp": 1.04378974, + "diversity_loss_mlp": 0.0, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.06683426358110046, + "language_loss": 0.76389247, + "learning_rate": 0.00013613902414026207, + "loss": 0.77442169, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3985, + "time_per_iteration": 2.7989237308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055274, + "balance_loss_mlp": 1.04588056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7668333974605618, + "flos": 774303017472.0, + "grad_norm": 0.07515257411295292, + "language_loss": 0.82508516, + "learning_rate": 0.00013592541655665642, + "loss": 0.83563781, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 3986, + "time_per_iteration": 3.015293836593628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_mlp": 1.04635525, + "diversity_loss_mlp": 0.0, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.07774054250244124, + "language_loss": 0.85269868, + "learning_rate": 0.00013571195031933947, + "loss": 0.86325783, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 3987, + "time_per_iteration": 2.6980810165405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010581, + "balance_loss_mlp": 1.0057168, + "diversity_loss_mlp": 0.0, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.012742252799641985, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81491923, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 3988, + "time_per_iteration": 4.809666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049905, + "balance_loss_mlp": 1.04043365, + "diversity_loss_mlp": 0.0, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.07424799958173026, + "language_loss": 0.85590923, + "learning_rate": 0.00013528544221501655, + "loss": 0.86640829, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.09460449, + "routerloss_mlp": 0.0, + "step": 3989, + "time_per_iteration": 2.7649118900299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010579, + "balance_loss_mlp": 1.04848218, + "diversity_loss_mlp": 0.0, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.07001972276723446, + "language_loss": 0.81763613, + "learning_rate": 0.00013507240051359586, + "loss": 0.82821512, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3990, + "time_per_iteration": 3.0377867221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_mlp": 1.04797447, + "diversity_loss_mlp": 0.0, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.07160878890290734, + "language_loss": 0.86059034, + "learning_rate": 0.00013485950048963425, + "loss": 0.87116206, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 3991, + "time_per_iteration": 2.5790224075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_mlp": 1.04409003, + "diversity_loss_mlp": 0.0, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.0667031946156718, + "language_loss": 0.82767689, + "learning_rate": 0.00013464674222578643, + "loss": 0.83820868, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 3992, + "time_per_iteration": 3.201578140258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057061, + "balance_loss_mlp": 1.04791176, + "diversity_loss_mlp": 0.0, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.08569609854575283, + "language_loss": 0.83404213, + "learning_rate": 0.00013443412580465292, + "loss": 0.84461272, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 3993, + "time_per_iteration": 2.5704004764556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_mlp": 1.04118383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.0673936052155154, + "language_loss": 0.83964813, + "learning_rate": 0.00013422165130877857, + "loss": 0.85015404, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 3994, + "time_per_iteration": 2.9138286113739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057424, + "balance_loss_mlp": 1.0483048, + "diversity_loss_mlp": 0.0, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.07281784593119212, + "language_loss": 0.8049981, + "learning_rate": 0.00013400931882065327, + "loss": 0.81557238, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 3995, + "time_per_iteration": 2.6342077255249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055278, + "balance_loss_mlp": 1.04585409, + "diversity_loss_mlp": 0.0, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.062093519620885704, + "language_loss": 0.80842459, + "learning_rate": 0.0001337971284227118, + "loss": 0.81897736, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 3996, + "time_per_iteration": 3.0022008419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004861, + "balance_loss_mlp": 1.00011611, + "diversity_loss_mlp": 0.0, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.007312606829584695, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77123284, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 3997, + "time_per_iteration": 4.911606311798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055259, + "balance_loss_mlp": 1.04605579, + "diversity_loss_mlp": 0.0, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.06973120075241693, + "language_loss": 0.8046248, + "learning_rate": 0.0001333731742268438, + "loss": 0.81517738, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 3998, + "time_per_iteration": 2.683593273162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053524, + "balance_loss_mlp": 1.0442791, + "diversity_loss_mlp": 0.0, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.0765354269800423, + "language_loss": 0.85693717, + "learning_rate": 0.0001331614105935109, + "loss": 0.86747241, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 3999, + "time_per_iteration": 2.675220489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054062, + "balance_loss_mlp": 1.04481769, + "diversity_loss_mlp": 0.0, + "epoch": 0.7695267410542517, + "flos": 660378438144.0, + "grad_norm": 0.06349178277774252, + "language_loss": 0.84176111, + "learning_rate": 0.00013294978937954883, + "loss": 0.85230172, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4000, + "time_per_iteration": 2.8622941970825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_mlp": 1.04558492, + "diversity_loss_mlp": 0.0, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.09234703224205486, + "language_loss": 0.85414779, + "learning_rate": 0.00013273831066711655, + "loss": 0.86469758, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4001, + "time_per_iteration": 2.6298534870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_mlp": 1.04325461, + "diversity_loss_mlp": 0.0, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.06055695533202859, + "language_loss": 0.79907209, + "learning_rate": 0.00013252697453831747, + "loss": 0.8095969, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4002, + "time_per_iteration": 2.692922830581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047306, + "balance_loss_mlp": 1.03798985, + "diversity_loss_mlp": 0.0, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.06495740089460322, + "language_loss": 0.82613641, + "learning_rate": 0.00013231578107519916, + "loss": 0.83660942, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4003, + "time_per_iteration": 2.9229555130004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049706, + "balance_loss_mlp": 1.04049134, + "diversity_loss_mlp": 0.0, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.07621650724161941, + "language_loss": 0.82803172, + "learning_rate": 0.00013210473035975422, + "loss": 0.83852881, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4004, + "time_per_iteration": 2.569532632827759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_mlp": 1.04116035, + "diversity_loss_mlp": 0.0, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.07296352629436301, + "language_loss": 0.85812414, + "learning_rate": 0.0001318938224739201, + "loss": 0.86862826, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4005, + "time_per_iteration": 3.0234341621398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_mlp": 1.04063106, + "diversity_loss_mlp": 0.0, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06528825004105314, + "language_loss": 0.83766401, + "learning_rate": 0.00013168305749957843, + "loss": 0.84816337, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.09301758, + "routerloss_mlp": 0.0, + "step": 4006, + "time_per_iteration": 2.733548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790765, + "balance_loss_mlp": 1.33768153, + "diversity_loss_mlp": 0.22157404, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.030772470198916744, + "language_loss": 0.82874978, + "learning_rate": 0.00013147243551855532, + "loss": 0.8366574, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01113757, + "step": 4007, + "time_per_iteration": 2.6124446392059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_mlp": 1.0404737, + "diversity_loss_mlp": 0.0, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.05859111752284866, + "language_loss": 0.80677342, + "learning_rate": 0.00013126195661262148, + "loss": 0.81727076, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4008, + "time_per_iteration": 2.7372946739196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052041, + "balance_loss_mlp": 1.04286766, + "diversity_loss_mlp": 0.0, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.06950402202343967, + "language_loss": 0.86921602, + "learning_rate": 0.00013105162086349216, + "loss": 0.87973642, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4009, + "time_per_iteration": 2.825164556503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050421, + "balance_loss_mlp": 1.04102159, + "diversity_loss_mlp": 0.0, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.05664497988696294, + "language_loss": 0.85945249, + "learning_rate": 0.00013084142835282687, + "loss": 0.86995667, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 4010, + "time_per_iteration": 2.6627306938171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590218, + "balance_loss_mlp": 1.02735484, + "diversity_loss_mlp": 0.13424492, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.0012430140076356488, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80474579, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00941846, + "step": 4011, + "time_per_iteration": 4.808507919311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050884, + "balance_loss_mlp": 1.04154992, + "diversity_loss_mlp": 0.0, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.062052307609784016, + "language_loss": 0.89290094, + "learning_rate": 0.0001304214733732485, + "loss": 0.90340984, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4012, + "time_per_iteration": 2.7328708171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105349, + "balance_loss_mlp": 1.04380453, + "diversity_loss_mlp": 0.0, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.07734543299334512, + "language_loss": 0.82669097, + "learning_rate": 0.00013021171106737672, + "loss": 0.83722585, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.09686279, + "routerloss_mlp": 0.0, + "step": 4013, + "time_per_iteration": 2.6573734283447266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049877, + "balance_loss_mlp": 1.04070377, + "diversity_loss_mlp": 0.0, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.06603423132938777, + "language_loss": 0.80092031, + "learning_rate": 0.00013000209232605071, + "loss": 0.81141913, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4014, + "time_per_iteration": 2.717602014541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053571, + "balance_loss_mlp": 1.04388535, + "diversity_loss_mlp": 0.0, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.10571386830465022, + "language_loss": 0.80179751, + "learning_rate": 0.0001297926172306519, + "loss": 0.81233323, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.09674072, + "routerloss_mlp": 0.0, + "step": 4015, + "time_per_iteration": 2.65010142326355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_mlp": 1.04230273, + "diversity_loss_mlp": 0.0, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.06492582612573077, + "language_loss": 0.7883606, + "learning_rate": 0.0001295832858625055, + "loss": 0.79887861, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4016, + "time_per_iteration": 3.2565736770629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050538, + "balance_loss_mlp": 1.04109037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.06662088321139942, + "language_loss": 0.70083648, + "learning_rate": 0.00012937409830288154, + "loss": 0.71134186, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 4017, + "time_per_iteration": 2.818197250366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046111, + "balance_loss_mlp": 1.03688383, + "diversity_loss_mlp": 0.0, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.08953669234150197, + "language_loss": 0.84953344, + "learning_rate": 0.00012916505463299362, + "loss": 0.85999447, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4018, + "time_per_iteration": 2.5104525089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_mlp": 1.03696132, + "diversity_loss_mlp": 0.0, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.08710028809718832, + "language_loss": 0.78235918, + "learning_rate": 0.00012895615493399972, + "loss": 0.79282427, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 4019, + "time_per_iteration": 2.7878103256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104679, + "balance_loss_mlp": 1.03747368, + "diversity_loss_mlp": 0.0, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.07808729146965544, + "language_loss": 0.82637143, + "learning_rate": 0.00012874739928700192, + "loss": 0.83683932, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 4020, + "time_per_iteration": 2.5788097381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_mlp": 1.03501582, + "diversity_loss_mlp": 0.0, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.07324265685000747, + "language_loss": 0.79874408, + "learning_rate": 0.00012853878777304624, + "loss": 0.80918914, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 4021, + "time_per_iteration": 2.870278835296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794381, + "balance_loss_mlp": 1.34430456, + "diversity_loss_mlp": 0.22252312, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.029931863934209574, + "language_loss": 0.84459031, + "learning_rate": 0.000128330320473123, + "loss": 0.85253412, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01096685, + "step": 4022, + "time_per_iteration": 2.7129287719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008173, + "balance_loss_mlp": 1.00330901, + "diversity_loss_mlp": 0.0, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.013994594591819043, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.7934007, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.04858398, + "routerloss_mlp": 0.0, + "step": 4023, + "time_per_iteration": 4.895900726318359 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051725, + "balance_loss_mlp": 1.04231346, + "diversity_loss_mlp": 0.0, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.07018696985022486, + "language_loss": 0.81708258, + "learning_rate": 0.0001279138188390543, + "loss": 0.82759976, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.09405518, + "routerloss_mlp": 0.0, + "step": 4024, + "time_per_iteration": 2.745079517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_mlp": 1.04130435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.06486800405407347, + "language_loss": 0.86009115, + "learning_rate": 0.00012770578466660915, + "loss": 0.87059748, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4025, + "time_per_iteration": 2.848886013031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054208, + "balance_loss_mlp": 1.04474843, + "diversity_loss_mlp": 0.0, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.06391594939980325, + "language_loss": 0.81626999, + "learning_rate": 0.0001274978950315968, + "loss": 0.82681203, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.09454346, + "routerloss_mlp": 0.0, + "step": 4026, + "time_per_iteration": 2.791773796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104997, + "balance_loss_mlp": 1.04037929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.11270799389052534, + "language_loss": 0.83240479, + "learning_rate": 0.00012729015001472716, + "loss": 0.84290445, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.0958252, + "routerloss_mlp": 0.0, + "step": 4027, + "time_per_iteration": 2.6333580017089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_mlp": 1.04164386, + "diversity_loss_mlp": 0.0, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.06039716871949276, + "language_loss": 0.81597829, + "learning_rate": 0.00012708254969665418, + "loss": 0.82648969, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.0949707, + "routerloss_mlp": 0.0, + "step": 4028, + "time_per_iteration": 2.753960132598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_mlp": 1.0482347, + "diversity_loss_mlp": 0.0, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.08015627547619836, + "language_loss": 0.83207834, + "learning_rate": 0.00012687509415797526, + "loss": 0.84265172, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4029, + "time_per_iteration": 2.549224376678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055891, + "balance_loss_mlp": 1.04669952, + "diversity_loss_mlp": 0.0, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.0754412874698092, + "language_loss": 0.81577122, + "learning_rate": 0.00012666778347923208, + "loss": 0.82633013, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4030, + "time_per_iteration": 2.6578049659729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058934, + "balance_loss_mlp": 1.04996991, + "diversity_loss_mlp": 0.0, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.05434911795401194, + "language_loss": 0.83884913, + "learning_rate": 0.0001264606177409092, + "loss": 0.84943849, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4031, + "time_per_iteration": 2.7437548637390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054431, + "balance_loss_mlp": 1.04539514, + "diversity_loss_mlp": 0.0, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.06981681066227559, + "language_loss": 0.85926938, + "learning_rate": 0.00012625359702343609, + "loss": 0.86981368, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4032, + "time_per_iteration": 2.7145252227783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062978, + "balance_loss_mlp": 1.05414999, + "diversity_loss_mlp": 0.0, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.06703655691775996, + "language_loss": 0.84627414, + "learning_rate": 0.00012604672140718504, + "loss": 0.85690391, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4033, + "time_per_iteration": 2.6776609420776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061314, + "balance_loss_mlp": 1.05224824, + "diversity_loss_mlp": 0.0, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.0713724123127894, + "language_loss": 0.77912575, + "learning_rate": 0.00012583999097247233, + "loss": 0.78973895, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4034, + "time_per_iteration": 2.8429367542266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058219, + "balance_loss_mlp": 1.04938531, + "diversity_loss_mlp": 0.0, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.07138701732892383, + "language_loss": 0.80042505, + "learning_rate": 0.0001256334057995578, + "loss": 0.81100732, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4035, + "time_per_iteration": 2.805361032485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060081, + "balance_loss_mlp": 1.05109227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.06152435345467902, + "language_loss": 0.85125613, + "learning_rate": 0.000125426965968645, + "loss": 0.86185694, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4036, + "time_per_iteration": 2.7150938510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064057, + "balance_loss_mlp": 1.05523515, + "diversity_loss_mlp": 0.0, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07000613008602406, + "language_loss": 0.819399, + "learning_rate": 0.00012522067155988092, + "loss": 0.83003962, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4037, + "time_per_iteration": 2.6996352672576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060303, + "balance_loss_mlp": 1.05135584, + "diversity_loss_mlp": 0.0, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.0718823999319763, + "language_loss": 0.75306779, + "learning_rate": 0.00012501452265335617, + "loss": 0.7636708, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4038, + "time_per_iteration": 2.8315415382385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066111, + "balance_loss_mlp": 1.05724156, + "diversity_loss_mlp": 0.0, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.06411925705378174, + "language_loss": 0.83063197, + "learning_rate": 0.0001248085193291047, + "loss": 0.84129304, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4039, + "time_per_iteration": 2.729095935821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069535, + "balance_loss_mlp": 1.0605464, + "diversity_loss_mlp": 0.0, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.05882048458025786, + "language_loss": 0.82089669, + "learning_rate": 0.00012460266166710443, + "loss": 0.83159202, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4040, + "time_per_iteration": 3.1514501571655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_mlp": 1.06013775, + "diversity_loss_mlp": 0.0, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.07867166554480139, + "language_loss": 0.77746958, + "learning_rate": 0.00012439694974727633, + "loss": 0.78815889, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4041, + "time_per_iteration": 3.0117955207824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_mlp": 1.05708027, + "diversity_loss_mlp": 0.0, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.06430167773545564, + "language_loss": 0.79798543, + "learning_rate": 0.00012419138364948458, + "loss": 0.80864501, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4042, + "time_per_iteration": 2.7055745124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064858, + "balance_loss_mlp": 1.05601263, + "diversity_loss_mlp": 0.0, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.06788477072783218, + "language_loss": 0.82296908, + "learning_rate": 0.00012398596345353702, + "loss": 0.83361769, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4043, + "time_per_iteration": 2.8943872451782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_mlp": 1.05608058, + "diversity_loss_mlp": 0.0, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06253380969554054, + "language_loss": 0.83342338, + "learning_rate": 0.0001237806892391851, + "loss": 0.8440733, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4044, + "time_per_iteration": 2.697079658508301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061837, + "balance_loss_mlp": 1.05312264, + "diversity_loss_mlp": 0.0, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.07069263559946819, + "language_loss": 0.81128013, + "learning_rate": 0.0001235755610861233, + "loss": 0.82189852, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4045, + "time_per_iteration": 2.7329134941101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066232, + "balance_loss_mlp": 1.05731463, + "diversity_loss_mlp": 0.0, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.07032278053298287, + "language_loss": 0.85504925, + "learning_rate": 0.0001233705790739893, + "loss": 0.86571157, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4046, + "time_per_iteration": 2.708867073059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061968, + "balance_loss_mlp": 1.05317056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.08570945023626393, + "language_loss": 0.7512747, + "learning_rate": 0.0001231657432823643, + "loss": 0.76189435, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4047, + "time_per_iteration": 3.209035634994507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064295, + "balance_loss_mlp": 1.05536008, + "diversity_loss_mlp": 0.0, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.07478772193794427, + "language_loss": 0.78683329, + "learning_rate": 0.0001229610537907725, + "loss": 0.79747623, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4048, + "time_per_iteration": 2.570645332336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063203, + "balance_loss_mlp": 1.05442929, + "diversity_loss_mlp": 0.0, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.07810921414498996, + "language_loss": 0.90262878, + "learning_rate": 0.00012275651067868143, + "loss": 0.91326082, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4049, + "time_per_iteration": 2.5862553119659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058533, + "balance_loss_mlp": 1.04978311, + "diversity_loss_mlp": 0.0, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.05845393765756997, + "language_loss": 0.80259252, + "learning_rate": 0.00012255211402550182, + "loss": 0.81317782, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4050, + "time_per_iteration": 3.2020328044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055369, + "balance_loss_mlp": 1.04645181, + "diversity_loss_mlp": 0.0, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + "grad_norm": 0.07830185849799275, + "language_loss": 0.76506507, + "learning_rate": 0.00012234786391058727, + "loss": 0.77561879, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4051, + "time_per_iteration": 2.823751449584961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059116, + "balance_loss_mlp": 1.05021727, + "diversity_loss_mlp": 0.0, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.07934971719083544, + "language_loss": 0.85162616, + "learning_rate": 0.0001221437604132352, + "loss": 0.86221731, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4052, + "time_per_iteration": 2.6284594535827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054893, + "balance_loss_mlp": 1.04598236, + "diversity_loss_mlp": 0.0, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.07077897315409304, + "language_loss": 0.8102321, + "learning_rate": 0.0001219398036126852, + "loss": 0.82078099, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4053, + "time_per_iteration": 2.7439231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059971, + "balance_loss_mlp": 1.05101228, + "diversity_loss_mlp": 0.0, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.06870313821829518, + "language_loss": 0.78245676, + "learning_rate": 0.00012173599358812027, + "loss": 0.79305649, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4054, + "time_per_iteration": 3.256080150604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058619, + "balance_loss_mlp": 1.04986334, + "diversity_loss_mlp": 0.0, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07402592003625927, + "language_loss": 0.82719493, + "learning_rate": 0.0001215323304186668, + "loss": 0.83778107, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4055, + "time_per_iteration": 2.7612040042877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105856, + "balance_loss_mlp": 1.05008435, + "diversity_loss_mlp": 0.0, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.06917846158934658, + "language_loss": 0.87829256, + "learning_rate": 0.00012132881418339364, + "loss": 0.88887817, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.08483887, + "routerloss_mlp": 0.0, + "step": 4056, + "time_per_iteration": 2.7365031242370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006422, + "balance_loss_mlp": 1.00186825, + "diversity_loss_mlp": 0.0, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.016656968003394067, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78523988, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4057, + "time_per_iteration": 4.83305811882019 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_mlp": 1.04785705, + "diversity_loss_mlp": 0.0, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.06805160455788861, + "language_loss": 0.77303064, + "learning_rate": 0.00012092222283137944, + "loss": 0.78359842, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4058, + "time_per_iteration": 2.749647617340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100669, + "balance_loss_mlp": 1.00213623, + "diversity_loss_mlp": 0.0, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.014137874321597207, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79913002, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.0456543, + "routerloss_mlp": 0.0, + "step": 4059, + "time_per_iteration": 4.786531209945679 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060827, + "balance_loss_mlp": 1.0521071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.0627573295973092, + "language_loss": 0.83679825, + "learning_rate": 0.00012051622016348856, + "loss": 0.84740651, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4060, + "time_per_iteration": 2.999849557876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060136, + "balance_loss_mlp": 1.05145788, + "diversity_loss_mlp": 0.0, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.09064537340570315, + "language_loss": 0.84317231, + "learning_rate": 0.00012031343978315539, + "loss": 0.85377359, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4061, + "time_per_iteration": 2.468447208404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056381, + "balance_loss_mlp": 1.04746997, + "diversity_loss_mlp": 0.0, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.06926307807295869, + "language_loss": 0.8253361, + "learning_rate": 0.00012011080681021774, + "loss": 0.83589995, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4062, + "time_per_iteration": 2.6554322242736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058674, + "balance_loss_mlp": 1.04981685, + "diversity_loss_mlp": 0.0, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.07294593948757502, + "language_loss": 0.86419785, + "learning_rate": 0.00011990832132334512, + "loss": 0.87478459, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4063, + "time_per_iteration": 2.514464855194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_mlp": 1.04535961, + "diversity_loss_mlp": 0.0, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.07578138035513655, + "language_loss": 0.82624197, + "learning_rate": 0.00011970598340114897, + "loss": 0.83678591, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4064, + "time_per_iteration": 2.931457042694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051575, + "balance_loss_mlp": 1.04267633, + "diversity_loss_mlp": 0.0, + "epoch": 0.7820315505963832, + "flos": 547669278720.0, + "grad_norm": 0.07400316047770077, + "language_loss": 0.84204572, + "learning_rate": 0.00011950379312218396, + "loss": 0.85256147, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4065, + "time_per_iteration": 2.7011330127716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053821, + "balance_loss_mlp": 1.04467154, + "diversity_loss_mlp": 0.0, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.057956585414562535, + "language_loss": 0.86203766, + "learning_rate": 0.00011930175056494719, + "loss": 0.87257588, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4066, + "time_per_iteration": 2.877427816390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_mlp": 1.04519939, + "diversity_loss_mlp": 0.0, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.057083401886059204, + "language_loss": 0.75923216, + "learning_rate": 0.00011909985580787885, + "loss": 0.76977456, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4067, + "time_per_iteration": 2.624633312225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047724, + "balance_loss_mlp": 1.03850365, + "diversity_loss_mlp": 0.0, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.05949124262263275, + "language_loss": 0.81228232, + "learning_rate": 0.00011889810892936137, + "loss": 0.82275951, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4068, + "time_per_iteration": 2.736132860183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060005, + "balance_loss_mlp": 1.05080259, + "diversity_loss_mlp": 0.0, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.067986892151795, + "language_loss": 0.77103662, + "learning_rate": 0.00011869651000771959, + "loss": 0.78163677, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4069, + "time_per_iteration": 2.8403103351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_mlp": 1.04549229, + "diversity_loss_mlp": 0.0, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.06684521190560817, + "language_loss": 0.83076346, + "learning_rate": 0.00011849505912122117, + "loss": 0.84130692, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4070, + "time_per_iteration": 2.7008423805236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_mlp": 1.04501987, + "diversity_loss_mlp": 0.0, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07690857771038405, + "language_loss": 0.78090364, + "learning_rate": 0.00011829375634807654, + "loss": 0.79144537, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4071, + "time_per_iteration": 3.033573627471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054971, + "balance_loss_mlp": 1.04576814, + "diversity_loss_mlp": 0.0, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.056420463967120596, + "language_loss": 0.81179786, + "learning_rate": 0.00011809260176643821, + "loss": 0.82234752, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4072, + "time_per_iteration": 3.047667980194092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057919, + "balance_loss_mlp": 1.0486629, + "diversity_loss_mlp": 0.0, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.08201668927537556, + "language_loss": 0.83855987, + "learning_rate": 0.00011789159545440131, + "loss": 0.84913909, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4073, + "time_per_iteration": 2.5870485305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061783, + "balance_loss_mlp": 1.05281854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05483100075639626, + "language_loss": 0.82342023, + "learning_rate": 0.00011769073749000348, + "loss": 0.83403808, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4074, + "time_per_iteration": 2.7744524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_mlp": 1.05058742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.07650558225741275, + "language_loss": 0.76181698, + "learning_rate": 0.0001174900279512246, + "loss": 0.77241433, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4075, + "time_per_iteration": 2.5718233585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055959, + "balance_loss_mlp": 1.04716742, + "diversity_loss_mlp": 0.0, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.06638794146044662, + "language_loss": 0.81755495, + "learning_rate": 0.00011728946691598707, + "loss": 0.82811451, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4076, + "time_per_iteration": 2.597710371017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057965, + "balance_loss_mlp": 1.0489229, + "diversity_loss_mlp": 0.0, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.07312696414479496, + "language_loss": 0.76038092, + "learning_rate": 0.00011708905446215561, + "loss": 0.77096057, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4077, + "time_per_iteration": 2.8587801456451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_mlp": 1.04389191, + "diversity_loss_mlp": 0.0, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.05480426452035972, + "language_loss": 0.79978698, + "learning_rate": 0.00011688879066753711, + "loss": 0.81031561, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4078, + "time_per_iteration": 2.6878645420074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00794674, + "balance_loss_mlp": 1.3435601, + "diversity_loss_mlp": 0.22424069, + "epoch": 0.7847248941900731, + "flos": 466102646784.0, + "grad_norm": 0.037025249970490705, + "language_loss": 0.87360638, + "learning_rate": 0.00011668867560988122, + "loss": 0.88155311, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077335, + "step": 4079, + "time_per_iteration": 2.605992317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055214, + "balance_loss_mlp": 1.04603505, + "diversity_loss_mlp": 0.0, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.07540056238596937, + "language_loss": 0.84502101, + "learning_rate": 0.00011648870936687916, + "loss": 0.85557318, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4080, + "time_per_iteration": 2.803166627883911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_mlp": 1.04527164, + "diversity_loss_mlp": 0.0, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.07109491685615342, + "language_loss": 0.7888999, + "learning_rate": 0.00011628889201616461, + "loss": 0.79944277, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4081, + "time_per_iteration": 2.6307146549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053935, + "balance_loss_mlp": 1.04494071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.06995649688675094, + "language_loss": 0.8206296, + "learning_rate": 0.00011608922363531393, + "loss": 0.83116901, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4082, + "time_per_iteration": 2.6929171085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_mlp": 1.04621124, + "diversity_loss_mlp": 0.0, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.06467745732761603, + "language_loss": 0.83401716, + "learning_rate": 0.00011588970430184504, + "loss": 0.84456635, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 4083, + "time_per_iteration": 3.0374722480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055907, + "balance_loss_mlp": 1.04704356, + "diversity_loss_mlp": 0.0, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.053416444226472466, + "language_loss": 0.81812388, + "learning_rate": 0.00011569033409321822, + "loss": 0.82868296, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4084, + "time_per_iteration": 2.7151241302490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056433, + "balance_loss_mlp": 1.04721808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.08362128305368578, + "language_loss": 0.72967046, + "learning_rate": 0.00011549111308683591, + "loss": 0.74023485, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4085, + "time_per_iteration": 2.703397750854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053784, + "balance_loss_mlp": 1.044855, + "diversity_loss_mlp": 0.0, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.07026628399198086, + "language_loss": 0.80478334, + "learning_rate": 0.00011529204136004251, + "loss": 0.81532121, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4086, + "time_per_iteration": 2.4818243980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_mlp": 1.04632854, + "diversity_loss_mlp": 0.0, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.06468878784636958, + "language_loss": 0.84670031, + "learning_rate": 0.00011509311899012459, + "loss": 0.85725284, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4087, + "time_per_iteration": 2.6685831546783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052321, + "balance_loss_mlp": 1.04333234, + "diversity_loss_mlp": 0.0, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.07857696263976417, + "language_loss": 0.781057, + "learning_rate": 0.00011489434605431053, + "loss": 0.7915802, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4088, + "time_per_iteration": 2.634192705154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050771, + "balance_loss_mlp": 1.0415858, + "diversity_loss_mlp": 0.0, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.06849593864396217, + "language_loss": 0.81194121, + "learning_rate": 0.0001146957226297708, + "loss": 0.82244897, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4089, + "time_per_iteration": 2.6896586418151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054397, + "balance_loss_mlp": 1.04508066, + "diversity_loss_mlp": 0.0, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.06226549816004976, + "language_loss": 0.76514363, + "learning_rate": 0.00011449724879361827, + "loss": 0.77568758, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4090, + "time_per_iteration": 3.0211868286132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105233, + "balance_loss_mlp": 1.04349613, + "diversity_loss_mlp": 0.0, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.10606387135755017, + "language_loss": 0.73947829, + "learning_rate": 0.00011429892462290687, + "loss": 0.75000155, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4091, + "time_per_iteration": 2.663403034210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051781, + "balance_loss_mlp": 1.04245293, + "diversity_loss_mlp": 0.0, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.07444773057019392, + "language_loss": 0.83167046, + "learning_rate": 0.00011410075019463295, + "loss": 0.84218824, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4092, + "time_per_iteration": 2.6732146739959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_mlp": 1.04006362, + "diversity_loss_mlp": 0.0, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.060787527331610934, + "language_loss": 0.80152667, + "learning_rate": 0.00011390272558573461, + "loss": 0.81201625, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4093, + "time_per_iteration": 2.7180373668670654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_mlp": 1.03762388, + "diversity_loss_mlp": 0.0, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.06490792600835427, + "language_loss": 0.7982657, + "learning_rate": 0.00011370485087309202, + "loss": 0.80873013, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4094, + "time_per_iteration": 2.6366312503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049978, + "balance_loss_mlp": 1.04087603, + "diversity_loss_mlp": 0.0, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.07475345031561743, + "language_loss": 0.79215139, + "learning_rate": 0.00011350712613352688, + "loss": 0.80265117, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4095, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.0379113, + "diversity_loss_mlp": 0.0, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.08748048466921367, + "language_loss": 0.79438257, + "learning_rate": 0.00011330955144380283, + "loss": 0.8048501, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4096, + "time_per_iteration": 2.641091823577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_mlp": 1.04231441, + "diversity_loss_mlp": 0.0, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.09762790842246886, + "language_loss": 0.8590734, + "learning_rate": 0.00011311212688062483, + "loss": 0.86958992, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.09338379, + "routerloss_mlp": 0.0, + "step": 4097, + "time_per_iteration": 2.7734925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_mlp": 1.03907609, + "diversity_loss_mlp": 0.0, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.07905994769378807, + "language_loss": 0.77729434, + "learning_rate": 0.0001129148525206402, + "loss": 0.78777593, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4098, + "time_per_iteration": 2.7954680919647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043495, + "balance_loss_mlp": 1.03457785, + "diversity_loss_mlp": 0.0, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.07239705861159748, + "language_loss": 0.86597443, + "learning_rate": 0.00011271772844043759, + "loss": 0.87640929, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4099, + "time_per_iteration": 2.6607439517974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_mlp": 1.03621721, + "diversity_loss_mlp": 0.0, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.0879845315874332, + "language_loss": 0.76285118, + "learning_rate": 0.00011252075471654727, + "loss": 0.7733022, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4100, + "time_per_iteration": 2.971648693084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105122, + "balance_loss_mlp": 1.04207063, + "diversity_loss_mlp": 0.0, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.0764302871750087, + "language_loss": 0.77711362, + "learning_rate": 0.00011232393142544133, + "loss": 0.78762579, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4101, + "time_per_iteration": 2.91229510307312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_mlp": 1.03860378, + "diversity_loss_mlp": 0.0, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.07185195333789275, + "language_loss": 0.82940054, + "learning_rate": 0.00011212725864353323, + "loss": 0.83987677, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4102, + "time_per_iteration": 3.1023645401000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_mlp": 1.02088332, + "diversity_loss_mlp": 0.0, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.024083596003167965, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77361244, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4103, + "time_per_iteration": 4.869060754776001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.03684092, + "diversity_loss_mlp": 0.0, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.08808407727788632, + "language_loss": 0.75807375, + "learning_rate": 0.00011173436491267291, + "loss": 0.76853269, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4104, + "time_per_iteration": 2.632619619369507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051432, + "balance_loss_mlp": 1.04226446, + "diversity_loss_mlp": 0.0, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.06591293045265766, + "language_loss": 0.81841874, + "learning_rate": 0.0001115381441162554, + "loss": 0.82893306, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4105, + "time_per_iteration": 2.6688740253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015618, + "balance_loss_mlp": 1.0112071, + "diversity_loss_mlp": 0.0, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.01578072375455914, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74599338, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4106, + "time_per_iteration": 4.878762245178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_mlp": 1.041677, + "diversity_loss_mlp": 0.0, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06419159755656932, + "language_loss": 0.85182965, + "learning_rate": 0.00011114615504234465, + "loss": 0.86233652, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4107, + "time_per_iteration": 2.7453701496124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_mlp": 1.03746724, + "diversity_loss_mlp": 0.0, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.07341048206377168, + "language_loss": 0.80923963, + "learning_rate": 0.00011095038691703468, + "loss": 0.81970477, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4108, + "time_per_iteration": 2.857043504714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_mlp": 1.03800678, + "diversity_loss_mlp": 0.0, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.06655370110946672, + "language_loss": 0.82816958, + "learning_rate": 0.00011075476983417998, + "loss": 0.83864009, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4109, + "time_per_iteration": 2.8551764488220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049932, + "balance_loss_mlp": 1.04054475, + "diversity_loss_mlp": 0.0, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.08565145998771567, + "language_loss": 0.7770009, + "learning_rate": 0.00011055930386972579, + "loss": 0.78750026, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 4110, + "time_per_iteration": 2.9051218032836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_mlp": 1.03950906, + "diversity_loss_mlp": 0.0, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.07889594156212229, + "language_loss": 0.78524226, + "learning_rate": 0.00011036398909955863, + "loss": 0.79572868, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4111, + "time_per_iteration": 2.9591848850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00801967, + "balance_loss_mlp": 1.35861206, + "diversity_loss_mlp": 0.22341654, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.031814716701276446, + "language_loss": 0.81445456, + "learning_rate": 0.00011016882559950648, + "loss": 0.82247424, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0109526, + "step": 4112, + "time_per_iteration": 2.8517532348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_mlp": 1.04066622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.06825914372029093, + "language_loss": 0.80628312, + "learning_rate": 0.00010997381344533853, + "loss": 0.81678075, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4113, + "time_per_iteration": 2.76458477973938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_mlp": 1.04482937, + "diversity_loss_mlp": 0.0, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.06296725861693256, + "language_loss": 0.80975449, + "learning_rate": 0.00010977895271276517, + "loss": 0.82029676, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4114, + "time_per_iteration": 2.677236795425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105271, + "balance_loss_mlp": 1.04387641, + "diversity_loss_mlp": 0.0, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.07698010071595295, + "language_loss": 0.79882276, + "learning_rate": 0.00010958424347743807, + "loss": 0.80934995, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4115, + "time_per_iteration": 2.7255280017852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056753, + "balance_loss_mlp": 1.04793203, + "diversity_loss_mlp": 0.0, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06323084510093162, + "language_loss": 0.80379033, + "learning_rate": 0.00010938968581494991, + "loss": 0.81435782, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4116, + "time_per_iteration": 2.956744909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056354, + "balance_loss_mlp": 1.0473659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.07593804019744407, + "language_loss": 0.78918922, + "learning_rate": 0.000109195279800835, + "loss": 0.79975271, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4117, + "time_per_iteration": 2.7232017517089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052824, + "balance_loss_mlp": 1.04372239, + "diversity_loss_mlp": 0.0, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.07668598230710005, + "language_loss": 0.76558191, + "learning_rate": 0.00010900102551056834, + "loss": 0.77611017, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4118, + "time_per_iteration": 3.0348682403564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_mlp": 1.04203153, + "diversity_loss_mlp": 0.0, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.06933579681898581, + "language_loss": 0.8458457, + "learning_rate": 0.00010880692301956601, + "loss": 0.85635561, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4119, + "time_per_iteration": 2.465395212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059146, + "balance_loss_mlp": 1.05027056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.06493837690301978, + "language_loss": 0.86651456, + "learning_rate": 0.00010861297240318518, + "loss": 0.87710601, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4120, + "time_per_iteration": 2.8506181240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_mlp": 1.04826188, + "diversity_loss_mlp": 0.0, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.07524766323731863, + "language_loss": 0.87229133, + "learning_rate": 0.00010841917373672444, + "loss": 0.88286078, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4121, + "time_per_iteration": 2.745227336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055842, + "balance_loss_mlp": 1.04712808, + "diversity_loss_mlp": 0.0, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.08118940133699648, + "language_loss": 0.78629029, + "learning_rate": 0.00010822552709542293, + "loss": 0.79684877, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4122, + "time_per_iteration": 2.813340425491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_mlp": 1.04677343, + "diversity_loss_mlp": 0.0, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.058728515527731805, + "language_loss": 0.86142117, + "learning_rate": 0.0001080320325544612, + "loss": 0.87197673, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4123, + "time_per_iteration": 2.6903398036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_mlp": 1.04438257, + "diversity_loss_mlp": 0.0, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.06377375336372411, + "language_loss": 0.83519953, + "learning_rate": 0.00010783869018895997, + "loss": 0.84573305, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4124, + "time_per_iteration": 2.6091437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_mlp": 1.04709673, + "diversity_loss_mlp": 0.0, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.06290112703691109, + "language_loss": 0.84019685, + "learning_rate": 0.00010764550007398189, + "loss": 0.85075527, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4125, + "time_per_iteration": 2.639021396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105337, + "balance_loss_mlp": 1.04447079, + "diversity_loss_mlp": 0.0, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.059983052052207615, + "language_loss": 0.81026101, + "learning_rate": 0.00010745246228452982, + "loss": 0.8207947, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4126, + "time_per_iteration": 2.567128896713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_mlp": 1.04658413, + "diversity_loss_mlp": 0.0, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.06538981258691282, + "language_loss": 0.81837595, + "learning_rate": 0.00010725957689554771, + "loss": 0.82892644, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.08477783, + "routerloss_mlp": 0.0, + "step": 4127, + "time_per_iteration": 2.7668473720550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105364, + "balance_loss_mlp": 1.04483056, + "diversity_loss_mlp": 0.0, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.06455760363891609, + "language_loss": 0.84442085, + "learning_rate": 0.00010706684398192013, + "loss": 0.85495722, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4128, + "time_per_iteration": 2.703094482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_mlp": 1.04694915, + "diversity_loss_mlp": 0.0, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.10398066376678644, + "language_loss": 0.81773114, + "learning_rate": 0.00010687426361847313, + "loss": 0.82829189, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4129, + "time_per_iteration": 2.730570077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_mlp": 1.04571033, + "diversity_loss_mlp": 0.0, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 0.06937610081260179, + "language_loss": 0.8574326, + "learning_rate": 0.00010668183587997254, + "loss": 0.86797965, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4130, + "time_per_iteration": 2.644259452819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_mlp": 1.04217792, + "diversity_loss_mlp": 0.0, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.05953600763070223, + "language_loss": 0.77579701, + "learning_rate": 0.0001064895608411256, + "loss": 0.78630781, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4131, + "time_per_iteration": 2.841925859451294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105178, + "balance_loss_mlp": 1.04286337, + "diversity_loss_mlp": 0.0, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.06486183241314894, + "language_loss": 0.80494809, + "learning_rate": 0.00010629743857657998, + "loss": 0.81546587, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4132, + "time_per_iteration": 2.9550116062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007878, + "balance_loss_mlp": 1.00334787, + "diversity_loss_mlp": 0.0, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.014279472424614392, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71606547, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4133, + "time_per_iteration": 4.61087965965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_mlp": 1.05091596, + "diversity_loss_mlp": 0.0, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.08419096338195846, + "language_loss": 0.82037973, + "learning_rate": 0.00010591365266868802, + "loss": 0.83097553, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 4134, + "time_per_iteration": 2.980473518371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006707, + "balance_loss_mlp": 1.00217748, + "diversity_loss_mlp": 0.0, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.013377465040040408, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76518488, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4135, + "time_per_iteration": 5.031512975692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051378, + "balance_loss_mlp": 1.04224622, + "diversity_loss_mlp": 0.0, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.08143958467983652, + "language_loss": 0.7928952, + "learning_rate": 0.00010553047875229166, + "loss": 0.80340898, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4136, + "time_per_iteration": 2.536219596862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053745, + "balance_loss_mlp": 1.04491794, + "diversity_loss_mlp": 0.0, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.05917621440441134, + "language_loss": 0.8352496, + "learning_rate": 0.00010533912147689328, + "loss": 0.84578705, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4137, + "time_per_iteration": 2.62947416305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052392, + "balance_loss_mlp": 1.04364753, + "diversity_loss_mlp": 0.0, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.07247645097842569, + "language_loss": 0.82383895, + "learning_rate": 0.00010514791742243656, + "loss": 0.83436286, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4138, + "time_per_iteration": 2.6058223247528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053846, + "balance_loss_mlp": 1.04486322, + "diversity_loss_mlp": 0.0, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.07856202151848143, + "language_loss": 0.82678479, + "learning_rate": 0.00010495686666315341, + "loss": 0.83732331, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4139, + "time_per_iteration": 2.8820180892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_mlp": 1.04509258, + "diversity_loss_mlp": 0.0, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.09207393340076041, + "language_loss": 0.77504325, + "learning_rate": 0.00010476596927321635, + "loss": 0.78558183, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4140, + "time_per_iteration": 2.5876264572143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054719, + "balance_loss_mlp": 1.04586816, + "diversity_loss_mlp": 0.0, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.06332389355869186, + "language_loss": 0.80286723, + "learning_rate": 0.00010457522532673835, + "loss": 0.81341445, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4141, + "time_per_iteration": 2.7853429317474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053842, + "balance_loss_mlp": 1.04521155, + "diversity_loss_mlp": 0.0, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.07594916891501999, + "language_loss": 0.83322799, + "learning_rate": 0.00010438463489777272, + "loss": 0.84376645, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4142, + "time_per_iteration": 2.574995756149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053543, + "balance_loss_mlp": 1.0441432, + "diversity_loss_mlp": 0.0, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.06219380630034642, + "language_loss": 0.77388006, + "learning_rate": 0.00010419419806031316, + "loss": 0.78441548, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.09399414, + "routerloss_mlp": 0.0, + "step": 4143, + "time_per_iteration": 2.681364059448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057205, + "balance_loss_mlp": 1.04838395, + "diversity_loss_mlp": 0.0, + "epoch": 0.7972297037322047, + "flos": 556208446464.0, + "grad_norm": 0.06244291716660837, + "language_loss": 0.83778638, + "learning_rate": 0.00010400391488829403, + "loss": 0.84835839, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4144, + "time_per_iteration": 2.7661397457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_mlp": 1.04754949, + "diversity_loss_mlp": 0.0, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.056029857219710606, + "language_loss": 0.86605, + "learning_rate": 0.00010381378545558984, + "loss": 0.87661684, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4145, + "time_per_iteration": 2.706909656524658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_mlp": 1.04191816, + "diversity_loss_mlp": 0.0, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.06718577287314217, + "language_loss": 0.84665811, + "learning_rate": 0.00010362380983601505, + "loss": 0.85716891, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4146, + "time_per_iteration": 2.529480218887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055069, + "balance_loss_mlp": 1.04609227, + "diversity_loss_mlp": 0.0, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.0571367932207486, + "language_loss": 0.7866556, + "learning_rate": 0.00010343398810332477, + "loss": 0.79720628, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4147, + "time_per_iteration": 3.4586639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105507, + "balance_loss_mlp": 1.04595661, + "diversity_loss_mlp": 0.0, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.07566676342485233, + "language_loss": 0.84437156, + "learning_rate": 0.00010324432033121467, + "loss": 0.85492229, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4148, + "time_per_iteration": 2.8839025497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053366, + "balance_loss_mlp": 1.04418659, + "diversity_loss_mlp": 0.0, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.06830192551222886, + "language_loss": 0.83435208, + "learning_rate": 0.00010305480659332005, + "loss": 0.84488571, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4149, + "time_per_iteration": 2.5951197147369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059283, + "balance_loss_mlp": 1.05012214, + "diversity_loss_mlp": 0.0, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.07563453451103978, + "language_loss": 0.83492422, + "learning_rate": 0.00010286544696321682, + "loss": 0.84551704, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4150, + "time_per_iteration": 2.5118510723114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055471, + "balance_loss_mlp": 1.04628563, + "diversity_loss_mlp": 0.0, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.07562833621575128, + "language_loss": 0.7924732, + "learning_rate": 0.00010267624151442073, + "loss": 0.80302793, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4151, + "time_per_iteration": 2.612138509750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052309, + "balance_loss_mlp": 1.04312396, + "diversity_loss_mlp": 0.0, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.07020647270289845, + "language_loss": 0.80794007, + "learning_rate": 0.000102487190320388, + "loss": 0.81846315, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4152, + "time_per_iteration": 3.3858306407928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_mlp": 1.0432297, + "diversity_loss_mlp": 0.0, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.08528953367031804, + "language_loss": 0.79654646, + "learning_rate": 0.00010229829345451475, + "loss": 0.80707145, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4153, + "time_per_iteration": 3.326597213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_mlp": 1.04706669, + "diversity_loss_mlp": 0.0, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06462141101761633, + "language_loss": 0.79619837, + "learning_rate": 0.00010210955099013724, + "loss": 0.80676001, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4154, + "time_per_iteration": 3.3817038536071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054824, + "balance_loss_mlp": 1.04566312, + "diversity_loss_mlp": 0.0, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.07616557599778462, + "language_loss": 0.76846623, + "learning_rate": 0.00010192096300053167, + "loss": 0.77901447, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4155, + "time_per_iteration": 3.081740379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105106, + "balance_loss_mlp": 1.04188037, + "diversity_loss_mlp": 0.0, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.0612954553036602, + "language_loss": 0.85157597, + "learning_rate": 0.00010173252955891477, + "loss": 0.86208659, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4156, + "time_per_iteration": 2.7239129543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055997, + "balance_loss_mlp": 1.04709256, + "diversity_loss_mlp": 0.0, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07720224754254114, + "language_loss": 0.73362273, + "learning_rate": 0.00010154425073844253, + "loss": 0.74418271, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4157, + "time_per_iteration": 2.696467638015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052019, + "balance_loss_mlp": 1.04316235, + "diversity_loss_mlp": 0.0, + "epoch": 0.7999230473258946, + "flos": 505060075008.0, + "grad_norm": 0.060505733748086536, + "language_loss": 0.82517296, + "learning_rate": 0.00010135612661221138, + "loss": 0.83569312, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4158, + "time_per_iteration": 2.582913398742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_mlp": 1.03880203, + "diversity_loss_mlp": 0.0, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.08198302238912947, + "language_loss": 0.81945235, + "learning_rate": 0.00010116815725325751, + "loss": 0.82993186, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4159, + "time_per_iteration": 3.28433895111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00798548, + "balance_loss_mlp": 1.34939909, + "diversity_loss_mlp": 0.22584054, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.032371691049230863, + "language_loss": 0.80472159, + "learning_rate": 0.00010098034273455725, + "loss": 0.81270707, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01092844, + "step": 4160, + "time_per_iteration": 3.020301342010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047481, + "balance_loss_mlp": 1.03802133, + "diversity_loss_mlp": 0.0, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.06923738075728161, + "language_loss": 0.79914421, + "learning_rate": 0.00010079268312902662, + "loss": 0.80961907, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.09448242, + "routerloss_mlp": 0.0, + "step": 4161, + "time_per_iteration": 2.663827657699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_mlp": 1.04445577, + "diversity_loss_mlp": 0.0, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.07955090405050065, + "language_loss": 0.82002842, + "learning_rate": 0.0001006051785095215, + "loss": 0.83056211, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4162, + "time_per_iteration": 2.669938087463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052136, + "balance_loss_mlp": 1.04306972, + "diversity_loss_mlp": 0.0, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.07737392704066832, + "language_loss": 0.79858398, + "learning_rate": 0.0001004178289488376, + "loss": 0.80910534, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4163, + "time_per_iteration": 2.7215919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052219, + "balance_loss_mlp": 1.04284358, + "diversity_loss_mlp": 0.0, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.06994031793136987, + "language_loss": 0.83999282, + "learning_rate": 0.0001002306345197106, + "loss": 0.85051501, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.09368896, + "routerloss_mlp": 0.0, + "step": 4164, + "time_per_iteration": 2.545501708984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049385, + "balance_loss_mlp": 1.04034317, + "diversity_loss_mlp": 0.0, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.07265204276246538, + "language_loss": 0.80238962, + "learning_rate": 0.00010004359529481571, + "loss": 0.81288344, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4165, + "time_per_iteration": 2.8751044273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_mlp": 1.04052877, + "diversity_loss_mlp": 0.0, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.07344708402099766, + "language_loss": 0.82382286, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83431858, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4166, + "time_per_iteration": 3.706587314605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051894, + "balance_loss_mlp": 1.04301274, + "diversity_loss_mlp": 0.0, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.0782603427027698, + "language_loss": 0.83461916, + "learning_rate": 9.966998274812234e-05, + "loss": 0.84513807, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4167, + "time_per_iteration": 2.5965118408203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.04132366, + "diversity_loss_mlp": 0.0, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.08470873380508834, + "language_loss": 0.81762064, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82812226, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4168, + "time_per_iteration": 2.6369173526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_mlp": 1.04494286, + "diversity_loss_mlp": 0.0, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.07955948845391579, + "language_loss": 0.79946613, + "learning_rate": 9.929699188895447e-05, + "loss": 0.81000549, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4169, + "time_per_iteration": 3.257819652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00590619, + "balance_loss_mlp": 1.02878523, + "diversity_loss_mlp": 0.13400336, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.001271365187533197, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.78645021, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00922488, + "step": 4170, + "time_per_iteration": 4.967956066131592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052767, + "balance_loss_mlp": 1.04368353, + "diversity_loss_mlp": 0.0, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.06699330376146911, + "language_loss": 0.83303684, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84356451, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4171, + "time_per_iteration": 2.511323928833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_mlp": 1.04476857, + "diversity_loss_mlp": 0.0, + "epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.0707874133261092, + "language_loss": 0.7890135, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79955202, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4172, + "time_per_iteration": 2.938361644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01002455, + "balance_loss_mlp": 0.99778163, + "diversity_loss_mlp": 0.0, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.01094338931973828, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81267017, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.04663086, + "routerloss_mlp": 0.0, + "step": 4173, + "time_per_iteration": 4.908462285995483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793014, + "balance_loss_mlp": 1.33927226, + "diversity_loss_mlp": 0.22488941, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.03516130293682118, + "language_loss": 0.88785201, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89578211, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01093344, + "step": 4174, + "time_per_iteration": 2.5922460556030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053625, + "balance_loss_mlp": 1.04467213, + "diversity_loss_mlp": 0.0, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.07944554575907646, + "language_loss": 0.78243375, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79296994, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4175, + "time_per_iteration": 2.6601076126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051864, + "balance_loss_mlp": 1.04280424, + "diversity_loss_mlp": 0.0, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.06387478026678979, + "language_loss": 0.84549594, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85601461, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4176, + "time_per_iteration": 2.7655818462371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_mlp": 1.0406065, + "diversity_loss_mlp": 0.0, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.07434715811474918, + "language_loss": 0.81265736, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82315314, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4177, + "time_per_iteration": 2.7365646362304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051174, + "balance_loss_mlp": 1.04198945, + "diversity_loss_mlp": 0.0, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.0854183247343152, + "language_loss": 0.84699386, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85750556, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4178, + "time_per_iteration": 2.6607935428619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047249, + "balance_loss_mlp": 1.03811717, + "diversity_loss_mlp": 0.0, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.07548014236337308, + "language_loss": 0.79687864, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80735117, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4179, + "time_per_iteration": 2.649068593978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01001844, + "balance_loss_mlp": 0.99719512, + "diversity_loss_mlp": 0.0, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.010296775940752873, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75735408, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4180, + "time_per_iteration": 4.874431133270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050118, + "balance_loss_mlp": 1.04090953, + "diversity_loss_mlp": 0.0, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.07453821883084652, + "language_loss": 0.77098471, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78148586, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4181, + "time_per_iteration": 2.687908887863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045921, + "balance_loss_mlp": 1.03656304, + "diversity_loss_mlp": 0.0, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.06056113889476793, + "language_loss": 0.80571556, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81617486, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.09356689, + "routerloss_mlp": 0.0, + "step": 4182, + "time_per_iteration": 2.755779981613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_mlp": 1.04540682, + "diversity_loss_mlp": 0.0, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.07500472983981471, + "language_loss": 0.7412895, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75183195, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4183, + "time_per_iteration": 2.959167242050171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046099, + "balance_loss_mlp": 1.03698587, + "diversity_loss_mlp": 0.0, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.07263280839339305, + "language_loss": 0.78791356, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79837459, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4184, + "time_per_iteration": 2.7201614379882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_mlp": 1.0401814, + "diversity_loss_mlp": 0.0, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.0782100616306692, + "language_loss": 0.77473164, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78522313, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4185, + "time_per_iteration": 3.134634256362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010494, + "balance_loss_mlp": 1.04034662, + "diversity_loss_mlp": 0.0, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.06922447607886563, + "language_loss": 0.77592742, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78642142, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4186, + "time_per_iteration": 2.961618423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_mlp": 1.04297376, + "diversity_loss_mlp": 0.0, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0745309975524961, + "language_loss": 0.81570286, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82622111, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4187, + "time_per_iteration": 2.9941747188568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050277, + "balance_loss_mlp": 1.04128897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.06519286758654869, + "language_loss": 0.87670028, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88720298, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4188, + "time_per_iteration": 2.8933184146881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049222, + "balance_loss_mlp": 1.04018009, + "diversity_loss_mlp": 0.0, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.07111853308758409, + "language_loss": 0.78227425, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79276645, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4189, + "time_per_iteration": 2.8829870223999023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050789, + "balance_loss_mlp": 1.0420208, + "diversity_loss_mlp": 0.0, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.07715619542299273, + "language_loss": 0.81660378, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8271116, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4190, + "time_per_iteration": 2.805797815322876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_mlp": 1.041592, + "diversity_loss_mlp": 0.0, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.07784281121647556, + "language_loss": 0.82554364, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83604801, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4191, + "time_per_iteration": 2.555079460144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055942, + "balance_loss_mlp": 1.0468998, + "diversity_loss_mlp": 0.0, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.08129119625333912, + "language_loss": 0.85176903, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86232841, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4192, + "time_per_iteration": 2.616278648376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057227, + "balance_loss_mlp": 1.04823291, + "diversity_loss_mlp": 0.0, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.06195550147591559, + "language_loss": 0.8222602, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83283252, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4193, + "time_per_iteration": 3.1774582862854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055176, + "balance_loss_mlp": 1.04616332, + "diversity_loss_mlp": 0.0, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.07492247011829438, + "language_loss": 0.82230604, + "learning_rate": 9.468729611697246e-05, + "loss": 0.83285773, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4194, + "time_per_iteration": 2.711758613586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105404, + "balance_loss_mlp": 1.04514122, + "diversity_loss_mlp": 0.0, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.05932556750810355, + "language_loss": 0.81710708, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82764751, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4195, + "time_per_iteration": 2.6608495712280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058947, + "balance_loss_mlp": 1.04997635, + "diversity_loss_mlp": 0.0, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.063085164329588, + "language_loss": 0.79428887, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80487841, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4196, + "time_per_iteration": 3.337599515914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_mlp": 1.04924726, + "diversity_loss_mlp": 0.0, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.06810941123985487, + "language_loss": 0.82549566, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83608055, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4197, + "time_per_iteration": 2.8500421047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00793266, + "balance_loss_mlp": 1.33856153, + "diversity_loss_mlp": 0.22554049, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.030004109162440378, + "language_loss": 0.80021191, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80814457, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01121513, + "step": 4198, + "time_per_iteration": 2.7939352989196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061077, + "balance_loss_mlp": 1.05221987, + "diversity_loss_mlp": 0.0, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.07237334672543508, + "language_loss": 0.79747534, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80808604, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4199, + "time_per_iteration": 2.616584300994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060422, + "balance_loss_mlp": 1.05148149, + "diversity_loss_mlp": 0.0, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.07529347845483464, + "language_loss": 0.83181953, + "learning_rate": 9.359556131514602e-05, + "loss": 0.8424238, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4200, + "time_per_iteration": 2.6320338249206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788939, + "balance_loss_mlp": 1.33364224, + "diversity_loss_mlp": 0.22200125, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.03126306975747278, + "language_loss": 0.8159976, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82388693, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01111754, + "step": 4201, + "time_per_iteration": 2.725898265838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060854, + "balance_loss_mlp": 1.05191302, + "diversity_loss_mlp": 0.0, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.07028300429625041, + "language_loss": 0.75730419, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76791275, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4202, + "time_per_iteration": 2.858754873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057987, + "balance_loss_mlp": 1.04905808, + "diversity_loss_mlp": 0.0, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.07410213802766576, + "language_loss": 0.72826529, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73884517, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4203, + "time_per_iteration": 2.910843849182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_mlp": 1.04489827, + "diversity_loss_mlp": 0.0, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.07872218498382196, + "language_loss": 0.88753879, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89807671, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4204, + "time_per_iteration": 2.531914234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059733, + "balance_loss_mlp": 1.05073869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.05750820164302825, + "language_loss": 0.87048918, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88108647, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4205, + "time_per_iteration": 2.7968151569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052186, + "balance_loss_mlp": 1.04308999, + "diversity_loss_mlp": 0.0, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06433103951625496, + "language_loss": 0.8483271, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85884893, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4206, + "time_per_iteration": 2.665961742401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_mlp": 1.04172313, + "diversity_loss_mlp": 0.0, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.0784365412189913, + "language_loss": 0.77137649, + "learning_rate": 9.232905077078824e-05, + "loss": 0.7818836, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4207, + "time_per_iteration": 2.7918972969055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105439, + "balance_loss_mlp": 1.04530609, + "diversity_loss_mlp": 0.0, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.07290792729834863, + "language_loss": 0.76617867, + "learning_rate": 9.214875321953164e-05, + "loss": 0.77672255, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4208, + "time_per_iteration": 2.6330010890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056212, + "balance_loss_mlp": 1.04722369, + "diversity_loss_mlp": 0.0, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06967828145804263, + "language_loss": 0.81180429, + "learning_rate": 9.196861401017164e-05, + "loss": 0.82236642, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4209, + "time_per_iteration": 2.8048768043518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_mlp": 1.04471278, + "diversity_loss_mlp": 0.0, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.08832200116465504, + "language_loss": 0.79589164, + "learning_rate": 9.178863321264475e-05, + "loss": 0.8064298, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4210, + "time_per_iteration": 2.775315046310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053364, + "balance_loss_mlp": 1.04430985, + "diversity_loss_mlp": 0.0, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.05749425026246104, + "language_loss": 0.79754937, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80808306, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4211, + "time_per_iteration": 2.6467440128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051728, + "balance_loss_mlp": 1.04233456, + "diversity_loss_mlp": 0.0, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.06468521234127066, + "language_loss": 0.8684355, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87895274, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4212, + "time_per_iteration": 2.6296494007110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051493, + "balance_loss_mlp": 1.04236174, + "diversity_loss_mlp": 0.0, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.05999607560391635, + "language_loss": 0.84117031, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85168523, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4213, + "time_per_iteration": 2.834974527359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_mlp": 1.03935528, + "diversity_loss_mlp": 0.0, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.07539161755647025, + "language_loss": 0.85083151, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86131608, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4214, + "time_per_iteration": 2.8861420154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053208, + "balance_loss_mlp": 1.04424381, + "diversity_loss_mlp": 0.0, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.07165268891230793, + "language_loss": 0.81364369, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82417578, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4215, + "time_per_iteration": 2.6690080165863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_mlp": 1.03829873, + "diversity_loss_mlp": 0.0, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.05808229124837682, + "language_loss": 0.83832216, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84879631, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4216, + "time_per_iteration": 2.8289334774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012526, + "balance_loss_mlp": 1.00782871, + "diversity_loss_mlp": 0.0, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.01559500500099235, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78272945, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.046875, + "routerloss_mlp": 0.0, + "step": 4217, + "time_per_iteration": 4.674102067947388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051919, + "balance_loss_mlp": 1.04281104, + "diversity_loss_mlp": 0.0, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.07154355832559847, + "language_loss": 0.85079706, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86131632, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4218, + "time_per_iteration": 2.8154706954956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_mlp": 1.03502214, + "diversity_loss_mlp": 0.0, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.06078221490906587, + "language_loss": 0.79071403, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80115348, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4219, + "time_per_iteration": 2.9709677696228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_mlp": 1.03838241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.07350013125355677, + "language_loss": 0.80881071, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81928694, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4220, + "time_per_iteration": 2.7022857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_mlp": 1.03677237, + "diversity_loss_mlp": 0.0, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.06142059768116679, + "language_loss": 0.87557077, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88603091, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4221, + "time_per_iteration": 2.637735366821289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052382, + "balance_loss_mlp": 1.04335153, + "diversity_loss_mlp": 0.0, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.06689891729172881, + "language_loss": 0.83563554, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84615934, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4222, + "time_per_iteration": 2.792409658432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010868, + "balance_loss_mlp": 1.00612342, + "diversity_loss_mlp": 0.0, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.013920089604171917, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79260939, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.04736328, + "routerloss_mlp": 0.0, + "step": 4223, + "time_per_iteration": 4.96152138710022 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051266, + "balance_loss_mlp": 1.04209328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.07751812943068913, + "language_loss": 0.8010273, + "learning_rate": 8.928557430748668e-05, + "loss": 0.81153995, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4224, + "time_per_iteration": 2.6411619186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010841, + "balance_loss_mlp": 1.00612068, + "diversity_loss_mlp": 0.0, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.013617776499522711, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77506471, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.04711914, + "routerloss_mlp": 0.0, + "step": 4225, + "time_per_iteration": 4.849999904632568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_mlp": 1.03853297, + "diversity_loss_mlp": 0.0, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06825415899254728, + "language_loss": 0.88826978, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89874828, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4226, + "time_per_iteration": 2.6051902770996094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104715, + "balance_loss_mlp": 1.03806627, + "diversity_loss_mlp": 0.0, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.07913354085389648, + "language_loss": 0.80409497, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81456649, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4227, + "time_per_iteration": 2.709742307662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_mlp": 1.03684425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.11840379948544452, + "language_loss": 0.82457888, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83503902, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4228, + "time_per_iteration": 2.6976981163024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_mlp": 1.04245067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.077990176521043, + "language_loss": 0.78880024, + "learning_rate": 8.839918887251025e-05, + "loss": 0.79931819, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4229, + "time_per_iteration": 2.7945659160614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_mlp": 1.04340506, + "diversity_loss_mlp": 0.0, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.06092121648139386, + "language_loss": 0.84136802, + "learning_rate": 8.822239090334472e-05, + "loss": 0.8518936, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4230, + "time_per_iteration": 2.946951389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_mlp": 1.03831291, + "diversity_loss_mlp": 0.0, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.06877906362209742, + "language_loss": 0.75546557, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7659418, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4231, + "time_per_iteration": 2.8897807598114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051697, + "balance_loss_mlp": 1.04225588, + "diversity_loss_mlp": 0.0, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.07632389877762422, + "language_loss": 0.82944, + "learning_rate": 8.786927463232774e-05, + "loss": 0.839957, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.09442139, + "routerloss_mlp": 0.0, + "step": 4232, + "time_per_iteration": 2.755648374557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052198, + "balance_loss_mlp": 1.04287577, + "diversity_loss_mlp": 0.0, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.07245949865511514, + "language_loss": 0.81604928, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82657123, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4233, + "time_per_iteration": 2.573910713195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048453, + "balance_loss_mlp": 1.03923225, + "diversity_loss_mlp": 0.0, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.07474822596726854, + "language_loss": 0.82091659, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83140111, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4234, + "time_per_iteration": 2.595383405685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_mlp": 1.04080522, + "diversity_loss_mlp": 0.0, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05760879468903708, + "language_loss": 0.86682582, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87732702, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.09307861, + "routerloss_mlp": 0.0, + "step": 4235, + "time_per_iteration": 2.8454620838165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050489, + "balance_loss_mlp": 1.04129791, + "diversity_loss_mlp": 0.0, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.07072559835413951, + "language_loss": 0.78216445, + "learning_rate": 8.716496267753343e-05, + "loss": 0.7926693, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4236, + "time_per_iteration": 2.4742040634155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_mlp": 1.03813028, + "diversity_loss_mlp": 0.0, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.06449709049791848, + "language_loss": 0.81412882, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82460093, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4237, + "time_per_iteration": 2.7545273303985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006046, + "balance_loss_mlp": 1.00139654, + "diversity_loss_mlp": 0.0, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.010587263465776719, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78858888, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4238, + "time_per_iteration": 5.016268730163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_mlp": 1.03776968, + "diversity_loss_mlp": 0.0, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.0684339838675198, + "language_loss": 0.82887548, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83934742, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 4239, + "time_per_iteration": 2.5211598873138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052042, + "balance_loss_mlp": 1.04271388, + "diversity_loss_mlp": 0.0, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.06874840636234532, + "language_loss": 0.85361314, + "learning_rate": 8.646321514990763e-05, + "loss": 0.8641336, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4240, + "time_per_iteration": 3.083944797515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00785137, + "balance_loss_mlp": 1.32642579, + "diversity_loss_mlp": 0.22223642, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.03037997104545499, + "language_loss": 0.81663668, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82448804, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108057, + "step": 4241, + "time_per_iteration": 2.849032163619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796697, + "balance_loss_mlp": 1.3468852, + "diversity_loss_mlp": 0.22464219, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.041459762566519655, + "language_loss": 0.84508646, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85305345, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0109333, + "step": 4242, + "time_per_iteration": 2.6374778747558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_mlp": 1.0404923, + "diversity_loss_mlp": 0.0, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.06813712019116032, + "language_loss": 0.80444574, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81494176, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4243, + "time_per_iteration": 2.5741348266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005855, + "balance_loss_mlp": 1.00120556, + "diversity_loss_mlp": 0.0, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.012183850402686274, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76290977, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.04638672, + "routerloss_mlp": 0.0, + "step": 4244, + "time_per_iteration": 4.708779573440552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0079579, + "balance_loss_mlp": 1.34605587, + "diversity_loss_mlp": 0.22397524, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.030280251177676618, + "language_loss": 0.86728865, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87524652, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077495, + "step": 4245, + "time_per_iteration": 2.9368019104003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006174, + "balance_loss_mlp": 1.00154853, + "diversity_loss_mlp": 0.0, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.013862139423476765, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73980916, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.04614258, + "routerloss_mlp": 0.0, + "step": 4246, + "time_per_iteration": 4.941858291625977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078277, + "balance_loss_mlp": 1.31999934, + "diversity_loss_mlp": 0.22372745, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.027810419821976344, + "language_loss": 0.84806323, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85589087, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01090694, + "step": 4247, + "time_per_iteration": 2.7287490367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791953, + "balance_loss_mlp": 1.33846903, + "diversity_loss_mlp": 0.22388186, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.03087757735964202, + "language_loss": 0.84696209, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85488164, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01077755, + "step": 4248, + "time_per_iteration": 2.7625157833099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053911, + "balance_loss_mlp": 1.04469025, + "diversity_loss_mlp": 0.0, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.06506910983745173, + "language_loss": 0.80918235, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81972146, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4249, + "time_per_iteration": 2.6258833408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_mlp": 1.03533733, + "diversity_loss_mlp": 0.0, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.06744676767794172, + "language_loss": 0.78911942, + "learning_rate": 8.47200942668846e-05, + "loss": 0.79956502, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4250, + "time_per_iteration": 2.7859880924224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048873, + "balance_loss_mlp": 1.03986096, + "diversity_loss_mlp": 0.0, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.09007032647039148, + "language_loss": 0.80543828, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81592703, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4251, + "time_per_iteration": 2.8444883823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050772, + "balance_loss_mlp": 1.04183125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.06143293566062141, + "language_loss": 0.87781107, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88831878, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4252, + "time_per_iteration": 2.710468053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051813, + "balance_loss_mlp": 1.04260981, + "diversity_loss_mlp": 0.0, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.06730242930695572, + "language_loss": 0.84812832, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85864639, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4253, + "time_per_iteration": 2.7251899242401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_mlp": 1.04279804, + "diversity_loss_mlp": 0.0, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.07105593379415724, + "language_loss": 0.77203315, + "learning_rate": 8.402735645731157e-05, + "loss": 0.7825532, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4254, + "time_per_iteration": 2.8979763984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_mlp": 1.03733134, + "diversity_loss_mlp": 0.0, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.07494925573658785, + "language_loss": 0.77925122, + "learning_rate": 8.385457557424098e-05, + "loss": 0.78971332, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4255, + "time_per_iteration": 2.5896246433258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048093, + "balance_loss_mlp": 1.03896809, + "diversity_loss_mlp": 0.0, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.05893979232495145, + "language_loss": 0.79938138, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80986238, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4256, + "time_per_iteration": 3.068570852279663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_mlp": 1.03782368, + "diversity_loss_mlp": 0.0, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.07101674717136439, + "language_loss": 0.80977142, + "learning_rate": 8.350949856106283e-05, + "loss": 0.82024205, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4257, + "time_per_iteration": 2.7494471073150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006173, + "balance_loss_mlp": 1.00154781, + "diversity_loss_mlp": 0.0, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.007149039484563577, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72155517, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.04614258, + "routerloss_mlp": 0.0, + "step": 4258, + "time_per_iteration": 4.839837074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_mlp": 1.03455889, + "diversity_loss_mlp": 0.0, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.06534196989657123, + "language_loss": 0.84030735, + "learning_rate": 8.316506833163318e-05, + "loss": 0.85074329, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4259, + "time_per_iteration": 2.6422817707061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_mlp": 1.04123449, + "diversity_loss_mlp": 0.0, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.05670368476253994, + "language_loss": 0.85545492, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86595714, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4260, + "time_per_iteration": 3.125713586807251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050233, + "balance_loss_mlp": 1.04122066, + "diversity_loss_mlp": 0.0, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.06904116359736774, + "language_loss": 0.81980395, + "learning_rate": 8.282128542083101e-05, + "loss": 0.83030629, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4261, + "time_per_iteration": 2.76778507232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.03641081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.058406154368980764, + "language_loss": 0.85347754, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86393321, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4262, + "time_per_iteration": 2.628774404525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052152, + "balance_loss_mlp": 1.04290724, + "diversity_loss_mlp": 0.0, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.09112328550849395, + "language_loss": 0.85125005, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86177158, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4263, + "time_per_iteration": 2.7492353916168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048431, + "balance_loss_mlp": 1.03952646, + "diversity_loss_mlp": 0.0, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.06356232342525024, + "language_loss": 0.82992971, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84041393, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4264, + "time_per_iteration": 3.54941725730896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052131, + "balance_loss_mlp": 1.04260004, + "diversity_loss_mlp": 0.0, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.061154055751469906, + "language_loss": 0.79944229, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80996358, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.09521484, + "routerloss_mlp": 0.0, + "step": 4265, + "time_per_iteration": 2.677964210510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052685, + "balance_loss_mlp": 1.04367328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.06353811334374408, + "language_loss": 0.78419554, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79472238, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4266, + "time_per_iteration": 3.203380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052882, + "balance_loss_mlp": 1.04395366, + "diversity_loss_mlp": 0.0, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.06191713334502218, + "language_loss": 0.80525327, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81578207, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4267, + "time_per_iteration": 2.6596202850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056507, + "balance_loss_mlp": 1.04715538, + "diversity_loss_mlp": 0.0, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.06008885513704129, + "language_loss": 0.81976879, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83033383, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4268, + "time_per_iteration": 2.8304736614227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053453, + "balance_loss_mlp": 1.04451835, + "diversity_loss_mlp": 0.0, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.06523361113761998, + "language_loss": 0.81845587, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82899046, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4269, + "time_per_iteration": 2.7376768589019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_mlp": 1.04417932, + "diversity_loss_mlp": 0.0, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.07673767837283801, + "language_loss": 0.83897698, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84950882, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4270, + "time_per_iteration": 2.6805686950683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051913, + "balance_loss_mlp": 1.04284751, + "diversity_loss_mlp": 0.0, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.07279388279593675, + "language_loss": 0.85111851, + "learning_rate": 8.11120992965671e-05, + "loss": 0.86163765, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4271, + "time_per_iteration": 3.080000877380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783822, + "balance_loss_mlp": 1.32480633, + "diversity_loss_mlp": 0.22162104, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.033634037430315754, + "language_loss": 0.82290757, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83074582, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01060844, + "step": 4272, + "time_per_iteration": 2.615750789642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_mlp": 1.04102731, + "diversity_loss_mlp": 0.0, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.07856247677174821, + "language_loss": 0.86208439, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87258351, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4273, + "time_per_iteration": 2.6263344287872314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051099, + "balance_loss_mlp": 1.04169989, + "diversity_loss_mlp": 0.0, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.08144467378809686, + "language_loss": 0.89614367, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90665472, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.09393311, + "routerloss_mlp": 0.0, + "step": 4274, + "time_per_iteration": 2.400228500366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054991, + "balance_loss_mlp": 1.04600263, + "diversity_loss_mlp": 0.0, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.06163444359601604, + "language_loss": 0.86974454, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88029444, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4275, + "time_per_iteration": 2.6878175735473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048878, + "balance_loss_mlp": 1.03988957, + "diversity_loss_mlp": 0.0, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.07177776406534302, + "language_loss": 0.82458985, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83507866, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4276, + "time_per_iteration": 2.666274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050704, + "balance_loss_mlp": 1.04178667, + "diversity_loss_mlp": 0.0, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.06822688117582502, + "language_loss": 0.79940748, + "learning_rate": 8.009438945831771e-05, + "loss": 0.80991459, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4277, + "time_per_iteration": 2.6920108795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_mlp": 1.04362309, + "diversity_loss_mlp": 0.0, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.06798166655440095, + "language_loss": 0.79305434, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80358148, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4278, + "time_per_iteration": 2.6593875885009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056035, + "balance_loss_mlp": 1.04679036, + "diversity_loss_mlp": 0.0, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 0.07994138400827414, + "language_loss": 0.82999951, + "learning_rate": 7.975645631856127e-05, + "loss": 0.84055984, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4279, + "time_per_iteration": 2.6803600788116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_mlp": 1.04226494, + "diversity_loss_mlp": 0.0, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.060738985338191206, + "language_loss": 0.744928, + "learning_rate": 7.958773444541916e-05, + "loss": 0.7554431, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.09234619, + "routerloss_mlp": 0.0, + "step": 4280, + "time_per_iteration": 2.7890987396240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055668, + "balance_loss_mlp": 1.04667926, + "diversity_loss_mlp": 0.0, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.06641835359143249, + "language_loss": 0.78285408, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79341078, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4281, + "time_per_iteration": 3.0231053829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052951, + "balance_loss_mlp": 1.04405797, + "diversity_loss_mlp": 0.0, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.07232954234982779, + "language_loss": 0.81364781, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82417727, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4282, + "time_per_iteration": 2.702601909637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009495, + "balance_loss_mlp": 1.00503695, + "diversity_loss_mlp": 0.0, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.005580683595342396, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76307166, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4283, + "time_per_iteration": 4.935715675354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057513, + "balance_loss_mlp": 1.04841709, + "diversity_loss_mlp": 0.0, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.0758894988729268, + "language_loss": 0.81082892, + "learning_rate": 7.89144797921037e-05, + "loss": 0.82140398, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4284, + "time_per_iteration": 2.6500790119171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010322, + "balance_loss_mlp": 1.00588739, + "diversity_loss_mlp": 0.0, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.005340107036422925, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78944594, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4285, + "time_per_iteration": 4.93043065071106 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055758, + "balance_loss_mlp": 1.04675198, + "diversity_loss_mlp": 0.0, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.052404155401405805, + "language_loss": 0.82728308, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83784062, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4286, + "time_per_iteration": 3.1566803455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054082, + "balance_loss_mlp": 1.04502165, + "diversity_loss_mlp": 0.0, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.07426299244547702, + "language_loss": 0.76636487, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77690566, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4287, + "time_per_iteration": 2.894404888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054163, + "balance_loss_mlp": 1.04488242, + "diversity_loss_mlp": 0.0, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.05641463912536871, + "language_loss": 0.79555058, + "learning_rate": 7.824384081587637e-05, + "loss": 0.8060922, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 4288, + "time_per_iteration": 2.8229329586029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058243, + "balance_loss_mlp": 1.04930818, + "diversity_loss_mlp": 0.0, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.0762203665991507, + "language_loss": 0.86487937, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87546182, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4289, + "time_per_iteration": 3.1116397380828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_mlp": 1.04234433, + "diversity_loss_mlp": 0.0, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.0740808728635397, + "language_loss": 0.78204668, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79255825, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4290, + "time_per_iteration": 2.9050347805023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054866, + "balance_loss_mlp": 1.04616976, + "diversity_loss_mlp": 0.0, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.058080618005571384, + "language_loss": 0.87400663, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88455528, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 4291, + "time_per_iteration": 3.1467742919921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_mlp": 1.04383206, + "diversity_loss_mlp": 0.0, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.06448799909112234, + "language_loss": 0.77267563, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78320277, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4292, + "time_per_iteration": 2.875955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105942, + "balance_loss_mlp": 1.05067623, + "diversity_loss_mlp": 0.0, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 0.06489065655526868, + "language_loss": 0.80734456, + "learning_rate": 7.740922673634537e-05, + "loss": 0.8179388, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4293, + "time_per_iteration": 2.906735420227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105726, + "balance_loss_mlp": 1.04794431, + "diversity_loss_mlp": 0.0, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.06785179357058724, + "language_loss": 0.78951818, + "learning_rate": 7.724279585440186e-05, + "loss": 0.80009079, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4294, + "time_per_iteration": 2.721102237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051983, + "balance_loss_mlp": 1.04291677, + "diversity_loss_mlp": 0.0, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.07073253675532468, + "language_loss": 0.8505556, + "learning_rate": 7.707652910136098e-05, + "loss": 0.8610754, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4295, + "time_per_iteration": 2.7751898765563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055812, + "balance_loss_mlp": 1.04672778, + "diversity_loss_mlp": 0.0, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.06741164173780789, + "language_loss": 0.84659898, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85715711, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4296, + "time_per_iteration": 2.6647472381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056393, + "balance_loss_mlp": 1.04746425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.07582259364872852, + "language_loss": 0.75999844, + "learning_rate": 7.674448824012514e-05, + "loss": 0.77056229, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4297, + "time_per_iteration": 2.6833221912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_mlp": 1.04438508, + "diversity_loss_mlp": 0.0, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.05929184332183984, + "language_loss": 0.83883959, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84937572, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4298, + "time_per_iteration": 2.8329238891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053687, + "balance_loss_mlp": 1.04474664, + "diversity_loss_mlp": 0.0, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.07448007019964706, + "language_loss": 0.84225285, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85278976, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4299, + "time_per_iteration": 2.489332675933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_mlp": 1.04037237, + "diversity_loss_mlp": 0.0, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.06599892876771768, + "language_loss": 0.85128617, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86177909, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4300, + "time_per_iteration": 3.732990026473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055631, + "balance_loss_mlp": 1.04661894, + "diversity_loss_mlp": 0.0, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.05906795179451105, + "language_loss": 0.82889211, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83944845, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4301, + "time_per_iteration": 2.690711259841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_mlp": 1.03897715, + "diversity_loss_mlp": 0.0, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.07258594610710227, + "language_loss": 0.77361107, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78409171, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4302, + "time_per_iteration": 2.9701120853424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788744, + "balance_loss_mlp": 1.3319999, + "diversity_loss_mlp": 0.22346261, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.027743371165779296, + "language_loss": 0.82558441, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83347189, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01101248, + "step": 4303, + "time_per_iteration": 3.223346471786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052756, + "balance_loss_mlp": 1.04391634, + "diversity_loss_mlp": 0.0, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.05962542188798652, + "language_loss": 0.7781111, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78863871, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4304, + "time_per_iteration": 2.7994863986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051008, + "balance_loss_mlp": 1.04218018, + "diversity_loss_mlp": 0.0, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.07052691004217361, + "language_loss": 0.84562683, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85613692, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4305, + "time_per_iteration": 3.0267395973205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_mlp": 1.04208159, + "diversity_loss_mlp": 0.0, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.07942922848471844, + "language_loss": 0.78335333, + "learning_rate": 7.525844574130947e-05, + "loss": 0.79386634, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4306, + "time_per_iteration": 2.914696455001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049867, + "balance_loss_mlp": 1.0407536, + "diversity_loss_mlp": 0.0, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 0.08577922080448468, + "language_loss": 0.82953119, + "learning_rate": 7.509415355178806e-05, + "loss": 0.8400299, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4307, + "time_per_iteration": 2.9498178958892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788913, + "balance_loss_mlp": 1.33115017, + "diversity_loss_mlp": 0.22477263, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.04309088247538252, + "language_loss": 0.77926069, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78714979, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01095133, + "step": 4308, + "time_per_iteration": 2.7063913345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050662, + "balance_loss_mlp": 1.04154897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.05899046117627297, + "language_loss": 0.81765443, + "learning_rate": 7.476606412570352e-05, + "loss": 0.828161, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4309, + "time_per_iteration": 3.0521981716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053534, + "balance_loss_mlp": 1.04459929, + "diversity_loss_mlp": 0.0, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.07518852690871787, + "language_loss": 0.80517173, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81570709, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4310, + "time_per_iteration": 2.904289722442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055947, + "balance_loss_mlp": 1.04662442, + "diversity_loss_mlp": 0.0, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.06212685924060065, + "language_loss": 0.81412387, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82468331, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.09313965, + "routerloss_mlp": 0.0, + "step": 4311, + "time_per_iteration": 3.203298807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052126, + "balance_loss_mlp": 1.04322684, + "diversity_loss_mlp": 0.0, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.05391272281173969, + "language_loss": 0.81940407, + "learning_rate": 7.427516832380948e-05, + "loss": 0.8299253, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4312, + "time_per_iteration": 2.8845975399017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055692, + "balance_loss_mlp": 1.04694164, + "diversity_loss_mlp": 0.0, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.05500480744199572, + "language_loss": 0.77808565, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78864259, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4313, + "time_per_iteration": 2.7781200408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_mlp": 1.03975666, + "diversity_loss_mlp": 0.0, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.06268776190670762, + "language_loss": 0.77513206, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78561807, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4314, + "time_per_iteration": 3.6484732627868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060785, + "balance_loss_mlp": 1.05197561, + "diversity_loss_mlp": 0.0, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.07094165320870974, + "language_loss": 0.83007073, + "learning_rate": 7.378576005087034e-05, + "loss": 0.84067863, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4315, + "time_per_iteration": 2.7556705474853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105563, + "balance_loss_mlp": 1.04686821, + "diversity_loss_mlp": 0.0, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.06645426228125094, + "language_loss": 0.84888268, + "learning_rate": 7.362295481759412e-05, + "loss": 0.85943896, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4316, + "time_per_iteration": 2.6553759574890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786621, + "balance_loss_mlp": 1.32643843, + "diversity_loss_mlp": 0.22519124, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.03189628781024831, + "language_loss": 0.83680773, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84467387, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01080582, + "step": 4317, + "time_per_iteration": 2.742246150970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_mlp": 1.04532266, + "diversity_loss_mlp": 0.0, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.06852217711760565, + "language_loss": 0.7890569, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79959965, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4318, + "time_per_iteration": 2.601116418838501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105841, + "balance_loss_mlp": 1.04927838, + "diversity_loss_mlp": 0.0, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.06935977491556748, + "language_loss": 0.83060843, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84119254, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4319, + "time_per_iteration": 2.7160871028900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_mlp": 1.04382229, + "diversity_loss_mlp": 0.0, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.07045309902078044, + "language_loss": 0.78631043, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79683906, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4320, + "time_per_iteration": 3.009129762649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059595, + "balance_loss_mlp": 1.05061913, + "diversity_loss_mlp": 0.0, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 0.06816415290870351, + "language_loss": 0.81865102, + "learning_rate": 7.281141292683746e-05, + "loss": 0.829247, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4321, + "time_per_iteration": 2.814836025238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056811, + "balance_loss_mlp": 1.04793024, + "diversity_loss_mlp": 0.0, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.06950401316575304, + "language_loss": 0.7471621, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75773025, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4322, + "time_per_iteration": 3.438296318054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057458, + "balance_loss_mlp": 1.0484755, + "diversity_loss_mlp": 0.0, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07376809791811713, + "language_loss": 0.82077682, + "learning_rate": 7.248795667511543e-05, + "loss": 0.83135134, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4323, + "time_per_iteration": 2.7750163078308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054403, + "balance_loss_mlp": 1.04560554, + "diversity_loss_mlp": 0.0, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.07472428991139068, + "language_loss": 0.77946472, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79000878, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4324, + "time_per_iteration": 3.035860776901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058014, + "balance_loss_mlp": 1.04919243, + "diversity_loss_mlp": 0.0, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.06856699827771942, + "language_loss": 0.83216256, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84274268, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4325, + "time_per_iteration": 2.705737352371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057993, + "balance_loss_mlp": 1.04915345, + "diversity_loss_mlp": 0.0, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.07351613065944015, + "language_loss": 0.82007957, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83065945, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4326, + "time_per_iteration": 2.601170539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060086, + "balance_loss_mlp": 1.0512104, + "diversity_loss_mlp": 0.0, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.05702554279595623, + "language_loss": 0.85418373, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86478466, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4327, + "time_per_iteration": 2.6739983558654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057295, + "balance_loss_mlp": 1.04846764, + "diversity_loss_mlp": 0.0, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.06350176662838333, + "language_loss": 0.82565081, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83622372, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4328, + "time_per_iteration": 2.608927011489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_mlp": 1.04681087, + "diversity_loss_mlp": 0.0, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.06140661393609168, + "language_loss": 0.81182075, + "learning_rate": 7.152157332111364e-05, + "loss": 0.82237709, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4329, + "time_per_iteration": 2.9149293899536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055768, + "balance_loss_mlp": 1.04682159, + "diversity_loss_mlp": 0.0, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.07439273272708623, + "language_loss": 0.8576234, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86818105, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4330, + "time_per_iteration": 2.9639134407043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105965, + "balance_loss_mlp": 1.05070877, + "diversity_loss_mlp": 0.0, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.08290776170171969, + "language_loss": 0.86890334, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87949985, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4331, + "time_per_iteration": 2.6148195266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054578, + "balance_loss_mlp": 1.04560781, + "diversity_loss_mlp": 0.0, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.057322207358884096, + "language_loss": 0.82625836, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83680409, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4332, + "time_per_iteration": 2.621798515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_mlp": 1.0477283, + "diversity_loss_mlp": 0.0, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.07570063772280167, + "language_loss": 0.82964915, + "learning_rate": 7.088064391927818e-05, + "loss": 0.84021485, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4333, + "time_per_iteration": 2.837819814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053173, + "balance_loss_mlp": 1.04428554, + "diversity_loss_mlp": 0.0, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.06974463300031715, + "language_loss": 0.83023667, + "learning_rate": 7.072082791073419e-05, + "loss": 0.8407684, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4334, + "time_per_iteration": 3.1047897338867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054952, + "balance_loss_mlp": 1.04588628, + "diversity_loss_mlp": 0.0, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 0.07461604540726756, + "language_loss": 0.82598537, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83653492, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4335, + "time_per_iteration": 2.5917162895202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105528, + "balance_loss_mlp": 1.04616058, + "diversity_loss_mlp": 0.0, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.07051755558905955, + "language_loss": 0.8628878, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87344062, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4336, + "time_per_iteration": 2.6134135723114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_mlp": 1.04197288, + "diversity_loss_mlp": 0.0, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.06598640893887409, + "language_loss": 0.83991468, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85042214, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4337, + "time_per_iteration": 2.7903592586517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_mlp": 1.04391873, + "diversity_loss_mlp": 0.0, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.0663044915688964, + "language_loss": 0.7816447, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79217046, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4338, + "time_per_iteration": 2.7299916744232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053332, + "balance_loss_mlp": 1.04413533, + "diversity_loss_mlp": 0.0, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.06355289445146371, + "language_loss": 0.76546603, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77599931, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4339, + "time_per_iteration": 2.8064498901367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052893, + "balance_loss_mlp": 1.04425037, + "diversity_loss_mlp": 0.0, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.061799613244502456, + "language_loss": 0.84427285, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85480177, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4340, + "time_per_iteration": 2.7731611728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105023, + "balance_loss_mlp": 1.04137301, + "diversity_loss_mlp": 0.0, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.15350718356465945, + "language_loss": 0.79499578, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80549812, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4341, + "time_per_iteration": 2.6016902923583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052884, + "balance_loss_mlp": 1.04431295, + "diversity_loss_mlp": 0.0, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.07564737297123257, + "language_loss": 0.78984159, + "learning_rate": 6.944830483504328e-05, + "loss": 0.80037045, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 4342, + "time_per_iteration": 2.670459747314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_mlp": 1.04070663, + "diversity_loss_mlp": 0.0, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.06668235677339521, + "language_loss": 0.8060447, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81654119, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4343, + "time_per_iteration": 2.817136287689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783572, + "balance_loss_mlp": 1.31915021, + "diversity_loss_mlp": 0.22572948, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.032919488551848924, + "language_loss": 0.84127021, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84910595, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01113241, + "step": 4344, + "time_per_iteration": 3.2518675327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059144, + "balance_loss_mlp": 1.05024457, + "diversity_loss_mlp": 0.0, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.06270529003473267, + "language_loss": 0.85050792, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86109936, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4345, + "time_per_iteration": 3.1636109352111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053656, + "balance_loss_mlp": 1.04487062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.07260078506489727, + "language_loss": 0.82210159, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83263814, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4346, + "time_per_iteration": 2.8204703330993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050108, + "balance_loss_mlp": 1.04092288, + "diversity_loss_mlp": 0.0, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.061944149403073474, + "language_loss": 0.8502146, + "learning_rate": 6.865840844295796e-05, + "loss": 0.86071575, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4347, + "time_per_iteration": 2.805941343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053763, + "balance_loss_mlp": 1.04459023, + "diversity_loss_mlp": 0.0, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.0772733121075158, + "language_loss": 0.8092171, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81975472, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4348, + "time_per_iteration": 3.040851593017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790162, + "balance_loss_mlp": 1.33250082, + "diversity_loss_mlp": 0.22602889, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 0.039903517211963106, + "language_loss": 0.86440182, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87230343, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108971, + "step": 4349, + "time_per_iteration": 2.699540615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054884, + "balance_loss_mlp": 1.04582453, + "diversity_loss_mlp": 0.0, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.07332657660036589, + "language_loss": 0.87533635, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88588518, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4350, + "time_per_iteration": 2.7678165435791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052928, + "balance_loss_mlp": 1.04408848, + "diversity_loss_mlp": 0.0, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.06629049094152589, + "language_loss": 0.85621446, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86674374, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4351, + "time_per_iteration": 2.737682819366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045769, + "balance_loss_mlp": 1.03676879, + "diversity_loss_mlp": 0.0, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.07766225400345093, + "language_loss": 0.82484055, + "learning_rate": 6.787269858905603e-05, + "loss": 0.8352983, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4352, + "time_per_iteration": 2.9142751693725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048952, + "balance_loss_mlp": 1.04007125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.06438247248872511, + "language_loss": 0.85065448, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86114407, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4353, + "time_per_iteration": 2.6874396800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_mlp": 1.0389719, + "diversity_loss_mlp": 0.0, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.07663124345564373, + "language_loss": 0.82635599, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83683646, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4354, + "time_per_iteration": 2.998286724090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052737, + "balance_loss_mlp": 1.04317021, + "diversity_loss_mlp": 0.0, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.07233016182516484, + "language_loss": 0.80633909, + "learning_rate": 6.74032853891452e-05, + "loss": 0.8168664, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.09564209, + "routerloss_mlp": 0.0, + "step": 4355, + "time_per_iteration": 2.75176739692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_mlp": 1.03711188, + "diversity_loss_mlp": 0.0, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.06437396666642163, + "language_loss": 0.82113147, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83159232, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4356, + "time_per_iteration": 2.638768196105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_mlp": 1.04145241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.06364273403340714, + "language_loss": 0.8922165, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90272063, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4357, + "time_per_iteration": 2.78487491607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051515, + "balance_loss_mlp": 1.04247308, + "diversity_loss_mlp": 0.0, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.08356541609520973, + "language_loss": 0.82212794, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83264303, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4358, + "time_per_iteration": 2.9017884731292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786956, + "balance_loss_mlp": 1.32808125, + "diversity_loss_mlp": 0.22438851, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.03328062669176706, + "language_loss": 0.86377019, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87163973, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072117, + "step": 4359, + "time_per_iteration": 2.57958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_mlp": 1.0387392, + "diversity_loss_mlp": 0.0, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.07170710125962251, + "language_loss": 0.87394094, + "learning_rate": 6.662428984145336e-05, + "loss": 0.8844198, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4360, + "time_per_iteration": 2.5944197177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016166, + "balance_loss_mlp": 1.01177895, + "diversity_loss_mlp": 0.0, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.01396369957588317, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72796351, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.04394531, + "routerloss_mlp": 0.0, + "step": 4361, + "time_per_iteration": 5.049343109130859 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_mlp": 1.04028869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.0657328713955244, + "language_loss": 0.82911998, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83961105, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4362, + "time_per_iteration": 2.857707977294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_mlp": 1.04045308, + "diversity_loss_mlp": 0.0, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 0.07766308356740377, + "language_loss": 0.80444038, + "learning_rate": 6.615891104554261e-05, + "loss": 0.81493711, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4363, + "time_per_iteration": 2.481901168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.0369525, + "diversity_loss_mlp": 0.0, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.061496061316517255, + "language_loss": 0.82737863, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83784378, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.09552002, + "routerloss_mlp": 0.0, + "step": 4364, + "time_per_iteration": 2.7074997425079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048671, + "balance_loss_mlp": 1.03946805, + "diversity_loss_mlp": 0.0, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.067014192244174, + "language_loss": 0.84650993, + "learning_rate": 6.58495005748016e-05, + "loss": 0.85699666, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4365, + "time_per_iteration": 3.1557445526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045295, + "balance_loss_mlp": 1.03640795, + "diversity_loss_mlp": 0.0, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.0631575802857794, + "language_loss": 0.89196813, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90242112, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4366, + "time_per_iteration": 2.624469757080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_mlp": 1.03753984, + "diversity_loss_mlp": 0.0, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.06347741472269025, + "language_loss": 0.83584821, + "learning_rate": 6.554076431268341e-05, + "loss": 0.8463158, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4367, + "time_per_iteration": 2.6431565284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_mlp": 1.04021287, + "diversity_loss_mlp": 0.0, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07076442779164972, + "language_loss": 0.80955088, + "learning_rate": 6.538664915972648e-05, + "loss": 0.82004237, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4368, + "time_per_iteration": 3.018554449081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00773368, + "balance_loss_mlp": 1.30118096, + "diversity_loss_mlp": 0.22479768, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.03439452063807504, + "language_loss": 0.77776653, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78550017, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01037853, + "step": 4369, + "time_per_iteration": 2.6944448947906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_mlp": 1.03648067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.1193689802326749, + "language_loss": 0.87956655, + "learning_rate": 6.507892510918079e-05, + "loss": 0.8900246, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.09326172, + "routerloss_mlp": 0.0, + "step": 4370, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047708, + "balance_loss_mlp": 1.03855264, + "diversity_loss_mlp": 0.0, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07411757925982031, + "language_loss": 0.81849647, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82897353, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4371, + "time_per_iteration": 2.776374578475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050613, + "balance_loss_mlp": 1.04111791, + "diversity_loss_mlp": 0.0, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.08018635739985482, + "language_loss": 0.77876925, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78927541, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.09484863, + "routerloss_mlp": 0.0, + "step": 4372, + "time_per_iteration": 2.7516069412231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008359, + "balance_loss_mlp": 1.00390017, + "diversity_loss_mlp": 0.0, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.00952058425700796, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78687477, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4373, + "time_per_iteration": 4.912792682647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_mlp": 1.0395565, + "diversity_loss_mlp": 0.0, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.07245552666854996, + "language_loss": 0.78958535, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80007321, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4374, + "time_per_iteration": 2.711447238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_mlp": 1.03688145, + "diversity_loss_mlp": 0.0, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.07770698856431457, + "language_loss": 0.77577722, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78623879, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4375, + "time_per_iteration": 2.694774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050017, + "balance_loss_mlp": 1.04059398, + "diversity_loss_mlp": 0.0, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.11734230107546348, + "language_loss": 0.80035317, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81085336, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.09411621, + "routerloss_mlp": 0.0, + "step": 4376, + "time_per_iteration": 2.918545961380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049017, + "balance_loss_mlp": 1.03976655, + "diversity_loss_mlp": 0.0, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 0.07794527811003633, + "language_loss": 0.72769749, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73818767, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4377, + "time_per_iteration": 3.4151737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050973, + "balance_loss_mlp": 1.04171598, + "diversity_loss_mlp": 0.0, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.0675536673804059, + "language_loss": 0.82617545, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83668518, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.0925293, + "routerloss_mlp": 0.0, + "step": 4378, + "time_per_iteration": 2.749711036682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048915, + "balance_loss_mlp": 1.03964031, + "diversity_loss_mlp": 0.0, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.06567054296588401, + "language_loss": 0.82044506, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83093417, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4379, + "time_per_iteration": 2.761420488357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049874, + "balance_loss_mlp": 1.04072499, + "diversity_loss_mlp": 0.0, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.06119198131713492, + "language_loss": 0.86507058, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87556934, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4380, + "time_per_iteration": 2.816401720046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049119, + "balance_loss_mlp": 1.04015481, + "diversity_loss_mlp": 0.0, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.08611471083111012, + "language_loss": 0.77840042, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78889161, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4381, + "time_per_iteration": 2.8088033199310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048589, + "balance_loss_mlp": 1.03958273, + "diversity_loss_mlp": 0.0, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.06180211012921075, + "language_loss": 0.79696667, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80745256, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.09014893, + "routerloss_mlp": 0.0, + "step": 4382, + "time_per_iteration": 3.0762522220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049388, + "balance_loss_mlp": 1.04026818, + "diversity_loss_mlp": 0.0, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.07097197774761282, + "language_loss": 0.80925977, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81975365, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4383, + "time_per_iteration": 2.8958194255828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_mlp": 1.03793836, + "diversity_loss_mlp": 0.0, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.08175099554660337, + "language_loss": 0.84386265, + "learning_rate": 6.294379473198208e-05, + "loss": 0.854334, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4384, + "time_per_iteration": 2.6954331398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049059, + "balance_loss_mlp": 1.03982067, + "diversity_loss_mlp": 0.0, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.0940310335311775, + "language_loss": 0.85289472, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86338532, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4385, + "time_per_iteration": 2.6073288917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052192, + "balance_loss_mlp": 1.0430907, + "diversity_loss_mlp": 0.0, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.06584361059499325, + "language_loss": 0.80478346, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81530541, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4386, + "time_per_iteration": 2.9602465629577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003223, + "balance_loss_mlp": 0.99876487, + "diversity_loss_mlp": 0.0, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.01332354164942413, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76839739, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4387, + "time_per_iteration": 4.922089099884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051887, + "balance_loss_mlp": 1.0426724, + "diversity_loss_mlp": 0.0, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.08625525862164317, + "language_loss": 0.82786238, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83838129, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.09216309, + "routerloss_mlp": 0.0, + "step": 4388, + "time_per_iteration": 2.8626224994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_mlp": 1.04066157, + "diversity_loss_mlp": 0.0, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.06592449787759593, + "language_loss": 0.79633564, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80683196, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4389, + "time_per_iteration": 2.681319236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_mlp": 1.04455543, + "diversity_loss_mlp": 0.0, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.06375633990495472, + "language_loss": 0.80234212, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81287819, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4390, + "time_per_iteration": 2.8914427757263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_mlp": 1.0421586, + "diversity_loss_mlp": 0.0, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 0.07030854249904422, + "language_loss": 0.74476206, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75527334, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4391, + "time_per_iteration": 2.983142375946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056789, + "balance_loss_mlp": 1.04770541, + "diversity_loss_mlp": 0.0, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.06360843007002392, + "language_loss": 0.80354917, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81411707, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4392, + "time_per_iteration": 3.266145706176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105708, + "balance_loss_mlp": 1.04769254, + "diversity_loss_mlp": 0.0, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.0822485878003235, + "language_loss": 0.72267312, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73324394, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 4393, + "time_per_iteration": 2.8685081005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_mlp": 1.03982329, + "diversity_loss_mlp": 0.0, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.07697573681675166, + "language_loss": 0.83679235, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84727973, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4394, + "time_per_iteration": 2.533674478530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053449, + "balance_loss_mlp": 1.04453218, + "diversity_loss_mlp": 0.0, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.07537571823528784, + "language_loss": 0.7097168, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72025126, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4395, + "time_per_iteration": 2.6876683235168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.04228103, + "diversity_loss_mlp": 0.0, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.08282627197829308, + "language_loss": 0.84426546, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85477906, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4396, + "time_per_iteration": 2.6650242805480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.0413115, + "diversity_loss_mlp": 0.0, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.15468816830135243, + "language_loss": 0.79700321, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80750489, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4397, + "time_per_iteration": 2.7101781368255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044589, + "balance_loss_mlp": 1.03563631, + "diversity_loss_mlp": 0.0, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.05893126536703995, + "language_loss": 0.75071192, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76115775, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4398, + "time_per_iteration": 2.9596059322357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_mlp": 1.03793144, + "diversity_loss_mlp": 0.0, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.0659677770319019, + "language_loss": 0.80090201, + "learning_rate": 6.069306450876389e-05, + "loss": 0.81137055, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4399, + "time_per_iteration": 2.750497341156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008801, + "balance_loss_mlp": 1.0044378, + "diversity_loss_mlp": 0.0, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.013995388355349315, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82717371, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.04370117, + "routerloss_mlp": 0.0, + "step": 4400, + "time_per_iteration": 4.847966432571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_mlp": 1.03586566, + "diversity_loss_mlp": 0.0, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.060817981350280916, + "language_loss": 0.79790008, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80835003, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4401, + "time_per_iteration": 2.668105125427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_mlp": 1.03601933, + "diversity_loss_mlp": 0.0, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.07199778737497019, + "language_loss": 0.84602404, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85647476, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4402, + "time_per_iteration": 2.8081796169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_mlp": 1.03515351, + "diversity_loss_mlp": 0.0, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.10925067279097164, + "language_loss": 0.87193465, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88237822, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4403, + "time_per_iteration": 2.7070863246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047531, + "balance_loss_mlp": 1.03842974, + "diversity_loss_mlp": 0.0, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.08568709869316025, + "language_loss": 0.84353817, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85401344, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4404, + "time_per_iteration": 2.5401875972747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_mlp": 1.03592503, + "diversity_loss_mlp": 0.0, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 0.0709686000036253, + "language_loss": 0.79758859, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80803621, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4405, + "time_per_iteration": 2.9586398601531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_mlp": 1.03862858, + "diversity_loss_mlp": 0.0, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.0758173793957083, + "language_loss": 0.80622578, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81670201, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4406, + "time_per_iteration": 2.5468907356262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104799, + "balance_loss_mlp": 1.03891182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.08716014432574012, + "language_loss": 0.83022702, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84070694, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4407, + "time_per_iteration": 3.180832862854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_mlp": 1.03713799, + "diversity_loss_mlp": 0.0, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.06912642288251827, + "language_loss": 0.80724686, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81770915, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4408, + "time_per_iteration": 2.8790152072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_mlp": 1.03665996, + "diversity_loss_mlp": 0.0, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.06430935054215667, + "language_loss": 0.82201052, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83246624, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4409, + "time_per_iteration": 2.8187878131866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048254, + "balance_loss_mlp": 1.03908658, + "diversity_loss_mlp": 0.0, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.07260617685747814, + "language_loss": 0.82220393, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83268642, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4410, + "time_per_iteration": 2.618715286254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011478, + "balance_loss_mlp": 1.00716281, + "diversity_loss_mlp": 0.0, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.010800011769390029, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77308393, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.04321289, + "routerloss_mlp": 0.0, + "step": 4411, + "time_per_iteration": 4.929163455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779672, + "balance_loss_mlp": 1.31286287, + "diversity_loss_mlp": 0.22471759, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.03344280518316992, + "language_loss": 0.74134266, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74913931, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01088216, + "step": 4412, + "time_per_iteration": 2.887648582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_mlp": 1.03763819, + "diversity_loss_mlp": 0.0, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.07759361608874747, + "language_loss": 0.79911488, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80958003, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4413, + "time_per_iteration": 2.634019613265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051803, + "balance_loss_mlp": 1.04298139, + "diversity_loss_mlp": 0.0, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06257116408066361, + "language_loss": 0.77061796, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78113604, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4414, + "time_per_iteration": 2.739079236984253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_mlp": 1.03654408, + "diversity_loss_mlp": 0.0, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.08252582476840821, + "language_loss": 0.779769, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79022747, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4415, + "time_per_iteration": 2.550938367843628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790044, + "balance_loss_mlp": 1.33385825, + "diversity_loss_mlp": 0.22484043, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.03510190626754167, + "language_loss": 0.82241035, + "learning_rate": 5.818863771788013e-05, + "loss": 0.83031082, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01069522, + "step": 4416, + "time_per_iteration": 2.629504442214966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052312, + "balance_loss_mlp": 1.04326987, + "diversity_loss_mlp": 0.0, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.06455923563838298, + "language_loss": 0.81343329, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82395649, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4417, + "time_per_iteration": 3.1615569591522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00774549, + "balance_loss_mlp": 1.30053818, + "diversity_loss_mlp": 0.22707056, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.03325715859037055, + "language_loss": 0.78278667, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79053217, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01074457, + "step": 4418, + "time_per_iteration": 3.063164234161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105333, + "balance_loss_mlp": 1.04439521, + "diversity_loss_mlp": 0.0, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 0.06460470640159872, + "language_loss": 0.84812874, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85866207, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4419, + "time_per_iteration": 2.694917678833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_mlp": 1.043239, + "diversity_loss_mlp": 0.0, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.11539940060888441, + "language_loss": 0.83957243, + "learning_rate": 5.76065545724877e-05, + "loss": 0.85009253, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4420, + "time_per_iteration": 2.8541665077209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053783, + "balance_loss_mlp": 1.04484272, + "diversity_loss_mlp": 0.0, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.06628978561515504, + "language_loss": 0.79903436, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80957222, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4421, + "time_per_iteration": 3.027402877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057337, + "balance_loss_mlp": 1.04840255, + "diversity_loss_mlp": 0.0, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.065145609650453, + "language_loss": 0.86839747, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87897086, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4422, + "time_per_iteration": 2.608247756958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060117, + "balance_loss_mlp": 1.05109882, + "diversity_loss_mlp": 0.0, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06673581896538218, + "language_loss": 0.84873575, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85933691, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4423, + "time_per_iteration": 2.640604019165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055598, + "balance_loss_mlp": 1.04669881, + "diversity_loss_mlp": 0.0, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.07136007632395135, + "language_loss": 0.84349924, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85405523, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4424, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056087, + "balance_loss_mlp": 1.04699087, + "diversity_loss_mlp": 0.0, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.09439640399937352, + "language_loss": 0.77805614, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78861696, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4425, + "time_per_iteration": 2.7769734859466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105464, + "balance_loss_mlp": 1.04534197, + "diversity_loss_mlp": 0.0, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.06728138208507028, + "language_loss": 0.78342903, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79397547, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4426, + "time_per_iteration": 2.878251075744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_mlp": 1.04355907, + "diversity_loss_mlp": 0.0, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.08229476351335695, + "language_loss": 0.78530198, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.7958256, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4427, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105621, + "balance_loss_mlp": 1.04718578, + "diversity_loss_mlp": 0.0, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.06960729962592987, + "language_loss": 0.79901236, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80957448, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4428, + "time_per_iteration": 2.7762057781219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055828, + "balance_loss_mlp": 1.04680383, + "diversity_loss_mlp": 0.0, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.07302244449525275, + "language_loss": 0.79662502, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80718338, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4429, + "time_per_iteration": 2.660036325454712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056924, + "balance_loss_mlp": 1.04789412, + "diversity_loss_mlp": 0.0, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.07546735542766958, + "language_loss": 0.78632665, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79689586, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4430, + "time_per_iteration": 2.6127545833587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105219, + "balance_loss_mlp": 1.04328537, + "diversity_loss_mlp": 0.0, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.06572867134879866, + "language_loss": 0.80567098, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81619287, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4431, + "time_per_iteration": 2.5721845626831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053312, + "balance_loss_mlp": 1.04428816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.07674502364366044, + "language_loss": 0.79846716, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80900025, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4432, + "time_per_iteration": 2.510967493057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057356, + "balance_loss_mlp": 1.04839182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 0.0698638093203012, + "language_loss": 0.80873108, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.8193047, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4433, + "time_per_iteration": 2.6090145111083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054798, + "balance_loss_mlp": 1.04608333, + "diversity_loss_mlp": 0.0, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.06627585566585331, + "language_loss": 0.82683206, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83738005, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4434, + "time_per_iteration": 2.5510103702545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055005, + "balance_loss_mlp": 1.04617763, + "diversity_loss_mlp": 0.0, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.06848630308035882, + "language_loss": 0.83932847, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84987855, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4435, + "time_per_iteration": 2.9127962589263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052958, + "balance_loss_mlp": 1.0440768, + "diversity_loss_mlp": 0.0, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.07760386997403859, + "language_loss": 0.83284372, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.8433733, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4436, + "time_per_iteration": 2.7183430194854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056242, + "balance_loss_mlp": 1.04731894, + "diversity_loss_mlp": 0.0, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.08897067825861743, + "language_loss": 0.79124266, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80180502, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4437, + "time_per_iteration": 2.6436634063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051335, + "balance_loss_mlp": 1.04229927, + "diversity_loss_mlp": 0.0, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.07034775984994458, + "language_loss": 0.82836092, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83887428, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4438, + "time_per_iteration": 2.71964430809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105491, + "balance_loss_mlp": 1.04575455, + "diversity_loss_mlp": 0.0, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.0746305826676403, + "language_loss": 0.83321023, + "learning_rate": 5.48792487359433e-05, + "loss": 0.8437593, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 4439, + "time_per_iteration": 2.7270102500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_mlp": 1.04193783, + "diversity_loss_mlp": 0.0, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.11714515413286376, + "language_loss": 0.81816977, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82868266, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.09350586, + "routerloss_mlp": 0.0, + "step": 4440, + "time_per_iteration": 2.665386915206909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047429, + "balance_loss_mlp": 1.03834486, + "diversity_loss_mlp": 0.0, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.06595291509459175, + "language_loss": 0.77334499, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78381932, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4441, + "time_per_iteration": 2.7599966526031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_mlp": 1.04063272, + "diversity_loss_mlp": 0.0, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.07060933653649062, + "language_loss": 0.82500267, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83549809, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4442, + "time_per_iteration": 2.639495372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051049, + "balance_loss_mlp": 1.04200077, + "diversity_loss_mlp": 0.0, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.07063393477475531, + "language_loss": 0.81464767, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82515812, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4443, + "time_per_iteration": 2.4978034496307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_mlp": 1.03971708, + "diversity_loss_mlp": 0.0, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.08111118700719577, + "language_loss": 0.77217865, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78266835, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4444, + "time_per_iteration": 2.4748144149780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_mlp": 1.04197693, + "diversity_loss_mlp": 0.0, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.09368313437946132, + "language_loss": 0.79476607, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80527484, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4445, + "time_per_iteration": 2.796154499053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_mlp": 1.04050708, + "diversity_loss_mlp": 0.0, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.06371937907069437, + "language_loss": 0.78714025, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.79763651, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4446, + "time_per_iteration": 2.5761666297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051208, + "balance_loss_mlp": 1.04208875, + "diversity_loss_mlp": 0.0, + "epoch": 0.8555213543670642, + "flos": 557009063424.0, + "grad_norm": 0.06774235964888489, + "language_loss": 0.76389277, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77440482, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4447, + "time_per_iteration": 2.772761344909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050103, + "balance_loss_mlp": 1.04116249, + "diversity_loss_mlp": 0.0, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.06327552262806617, + "language_loss": 0.75251746, + "learning_rate": 5.360911790663775e-05, + "loss": 0.76301849, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4448, + "time_per_iteration": 2.6334402561187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_mlp": 1.03859377, + "diversity_loss_mlp": 0.0, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.057928896872347986, + "language_loss": 0.78575248, + "learning_rate": 5.346885805197238e-05, + "loss": 0.7962302, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4449, + "time_per_iteration": 2.965585947036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105067, + "balance_loss_mlp": 1.0418725, + "diversity_loss_mlp": 0.0, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.07751296058129717, + "language_loss": 0.83346003, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84396672, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4450, + "time_per_iteration": 2.6572906970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051985, + "balance_loss_mlp": 1.04291868, + "diversity_loss_mlp": 0.0, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.06226038691697754, + "language_loss": 0.83402085, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84454072, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4451, + "time_per_iteration": 2.715268611907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050652, + "balance_loss_mlp": 1.04167557, + "diversity_loss_mlp": 0.0, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.07567123638772062, + "language_loss": 0.80818313, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.8186897, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4452, + "time_per_iteration": 3.072892665863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_mlp": 1.03925145, + "diversity_loss_mlp": 0.0, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.0664830695636331, + "language_loss": 0.84927678, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85975915, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4453, + "time_per_iteration": 2.538435697555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048709, + "balance_loss_mlp": 1.03954768, + "diversity_loss_mlp": 0.0, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.08569801456429596, + "language_loss": 0.84562624, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85611331, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4454, + "time_per_iteration": 2.510293960571289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045918, + "balance_loss_mlp": 1.03693008, + "diversity_loss_mlp": 0.0, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.07456272936898871, + "language_loss": 0.82575965, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83621883, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4455, + "time_per_iteration": 2.5304741859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783782, + "balance_loss_mlp": 1.32045674, + "diversity_loss_mlp": 0.22576013, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.031240053389996185, + "language_loss": 0.85362232, + "learning_rate": 5.249189615562627e-05, + "loss": 0.86146021, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067326, + "step": 4456, + "time_per_iteration": 2.6050779819488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047609, + "balance_loss_mlp": 1.03857875, + "diversity_loss_mlp": 0.0, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.05524865057671199, + "language_loss": 0.83069348, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84116954, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4457, + "time_per_iteration": 3.0707337856292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_mlp": 1.03807688, + "diversity_loss_mlp": 0.0, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.061549314191434064, + "language_loss": 0.75128138, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76175112, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4458, + "time_per_iteration": 2.8048369884490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009207, + "balance_loss_mlp": 1.00486779, + "diversity_loss_mlp": 0.0, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.009410723197847748, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85776496, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.04345703, + "routerloss_mlp": 0.0, + "step": 4459, + "time_per_iteration": 5.052462339401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049117, + "balance_loss_mlp": 1.04002094, + "diversity_loss_mlp": 0.0, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.05814228288805263, + "language_loss": 0.89274621, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90323746, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4460, + "time_per_iteration": 2.707102060317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_mlp": 1.03917027, + "diversity_loss_mlp": 0.0, + "epoch": 0.8582146979607541, + "flos": 706231954944.0, + "grad_norm": 0.07378533003990426, + "language_loss": 0.7931006, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80358326, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4461, + "time_per_iteration": 2.865081310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104748, + "balance_loss_mlp": 1.03854513, + "diversity_loss_mlp": 0.0, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.06549370953575787, + "language_loss": 0.823946, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.8344208, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4462, + "time_per_iteration": 2.960702419281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051966, + "balance_loss_mlp": 1.04283428, + "diversity_loss_mlp": 0.0, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.07292053022403922, + "language_loss": 0.85890585, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86942554, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 4463, + "time_per_iteration": 2.795929193496704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047374, + "balance_loss_mlp": 1.03847504, + "diversity_loss_mlp": 0.0, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.0593280148984403, + "language_loss": 0.78598225, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79645598, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4464, + "time_per_iteration": 2.81134033203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046787, + "balance_loss_mlp": 1.03755462, + "diversity_loss_mlp": 0.0, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.08434589868858423, + "language_loss": 0.80900252, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81947035, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4465, + "time_per_iteration": 2.698519229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055861, + "balance_loss_mlp": 1.04674125, + "diversity_loss_mlp": 0.0, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.08280689414498507, + "language_loss": 0.78631306, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79687166, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4466, + "time_per_iteration": 2.7119884490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051995, + "balance_loss_mlp": 1.04303014, + "diversity_loss_mlp": 0.0, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.0696773734857941, + "language_loss": 0.80894464, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81946456, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4467, + "time_per_iteration": 2.647484064102173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053716, + "balance_loss_mlp": 1.04451299, + "diversity_loss_mlp": 0.0, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.07756425408438049, + "language_loss": 0.83735067, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84788781, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.09197998, + "routerloss_mlp": 0.0, + "step": 4468, + "time_per_iteration": 2.606961488723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050814, + "balance_loss_mlp": 1.04189694, + "diversity_loss_mlp": 0.0, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.09275491108708087, + "language_loss": 0.76113212, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77164024, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4469, + "time_per_iteration": 2.7252352237701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044872, + "balance_loss_mlp": 1.03569305, + "diversity_loss_mlp": 0.0, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.0715850887288851, + "language_loss": 0.84056306, + "learning_rate": 5.056353024046462e-05, + "loss": 0.85101181, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4470, + "time_per_iteration": 2.705986261367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105045, + "balance_loss_mlp": 1.04136574, + "diversity_loss_mlp": 0.0, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.06285887675624062, + "language_loss": 0.83157659, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84208107, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4471, + "time_per_iteration": 2.666837215423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049444, + "balance_loss_mlp": 1.04027641, + "diversity_loss_mlp": 0.0, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.05893825733891097, + "language_loss": 0.81146169, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.8219561, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4472, + "time_per_iteration": 2.8742566108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_mlp": 1.03975582, + "diversity_loss_mlp": 0.0, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.0784559569656679, + "language_loss": 0.75468278, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76517183, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4473, + "time_per_iteration": 2.7587168216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049665, + "balance_loss_mlp": 1.04089117, + "diversity_loss_mlp": 0.0, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.06949986804746215, + "language_loss": 0.77037829, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78087491, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4474, + "time_per_iteration": 2.6033754348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_mlp": 1.0372982, + "diversity_loss_mlp": 0.0, + "epoch": 0.860908041554444, + "flos": 488394344448.0, + "grad_norm": 0.06715849698858382, + "language_loss": 0.82796544, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83842647, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4475, + "time_per_iteration": 2.6462340354919434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_mlp": 1.03617787, + "diversity_loss_mlp": 0.0, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.15717168716327404, + "language_loss": 0.80459589, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81504726, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4476, + "time_per_iteration": 2.6762094497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049439, + "balance_loss_mlp": 1.03996754, + "diversity_loss_mlp": 0.0, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.06321855833863838, + "language_loss": 0.86383665, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.874331, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.09472656, + "routerloss_mlp": 0.0, + "step": 4477, + "time_per_iteration": 3.0531985759735107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104971, + "balance_loss_mlp": 1.04053116, + "diversity_loss_mlp": 0.0, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.06893935293866559, + "language_loss": 0.82464266, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83513981, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4478, + "time_per_iteration": 2.6873598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104944, + "balance_loss_mlp": 1.04062486, + "diversity_loss_mlp": 0.0, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.0676917705812813, + "language_loss": 0.7915647, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80205905, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4479, + "time_per_iteration": 2.6640400886535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_mlp": 1.04052103, + "diversity_loss_mlp": 0.0, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.06998246415259375, + "language_loss": 0.81843102, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82892942, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.09320068, + "routerloss_mlp": 0.0, + "step": 4480, + "time_per_iteration": 2.6465768814086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_mlp": 1.0348897, + "diversity_loss_mlp": 0.0, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.09745126200827099, + "language_loss": 0.74436772, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.7548117, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.09490967, + "routerloss_mlp": 0.0, + "step": 4481, + "time_per_iteration": 2.7863497734069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048433, + "balance_loss_mlp": 1.03935492, + "diversity_loss_mlp": 0.0, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.06946555375175803, + "language_loss": 0.85534787, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86583215, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4482, + "time_per_iteration": 2.9774255752563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051438, + "balance_loss_mlp": 1.04190743, + "diversity_loss_mlp": 0.0, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.07498520167107697, + "language_loss": 0.77633011, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78684449, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.09527588, + "routerloss_mlp": 0.0, + "step": 4483, + "time_per_iteration": 3.2647812366485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783832, + "balance_loss_mlp": 1.32083893, + "diversity_loss_mlp": 0.22531055, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.03436935240738205, + "language_loss": 0.83586842, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84370679, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01075701, + "step": 4484, + "time_per_iteration": 2.928110122680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048003, + "balance_loss_mlp": 1.03885961, + "diversity_loss_mlp": 0.0, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.0696769189264069, + "language_loss": 0.82539618, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83587623, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4485, + "time_per_iteration": 2.8806722164154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048081, + "balance_loss_mlp": 1.0388062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.07034386086507984, + "language_loss": 0.77557874, + "learning_rate": 4.840156846389487e-05, + "loss": 0.7860595, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.09277344, + "routerloss_mlp": 0.0, + "step": 4486, + "time_per_iteration": 2.5718491077423096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045399, + "balance_loss_mlp": 1.03601718, + "diversity_loss_mlp": 0.0, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.08075284630280707, + "language_loss": 0.77191448, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78236842, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.09375, + "routerloss_mlp": 0.0, + "step": 4487, + "time_per_iteration": 3.206104040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_mlp": 1.03938758, + "diversity_loss_mlp": 0.0, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07054996301110567, + "language_loss": 0.78534716, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79583335, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 4488, + "time_per_iteration": 2.933553695678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049286, + "balance_loss_mlp": 1.03989816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.07600335888626973, + "language_loss": 0.83061361, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84110641, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.09387207, + "routerloss_mlp": 0.0, + "step": 4489, + "time_per_iteration": 2.7383370399475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_mlp": 1.03779912, + "diversity_loss_mlp": 0.0, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.08034973175032056, + "language_loss": 0.80326092, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81372881, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4490, + "time_per_iteration": 2.734177827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_mlp": 1.03565812, + "diversity_loss_mlp": 0.0, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.057692915875148014, + "language_loss": 0.76451778, + "learning_rate": 4.773514997362e-05, + "loss": 0.77496397, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4491, + "time_per_iteration": 3.0788826942443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049145, + "balance_loss_mlp": 1.04005527, + "diversity_loss_mlp": 0.0, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.07466724897853576, + "language_loss": 0.77982771, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.79031909, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4492, + "time_per_iteration": 2.530029058456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048683, + "balance_loss_mlp": 1.039379, + "diversity_loss_mlp": 0.0, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.07260420646457022, + "language_loss": 0.80692542, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81741226, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4493, + "time_per_iteration": 2.577784538269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00778204, + "balance_loss_mlp": 1.31030798, + "diversity_loss_mlp": 0.22490472, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.03497904945521898, + "language_loss": 0.82458371, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83236575, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01059737, + "step": 4494, + "time_per_iteration": 2.807935953140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_mlp": 1.03800845, + "diversity_loss_mlp": 0.0, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.07146424710596733, + "language_loss": 0.84123516, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.8517077, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.09246826, + "routerloss_mlp": 0.0, + "step": 4495, + "time_per_iteration": 2.5809860229492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_mlp": 1.03464222, + "diversity_loss_mlp": 0.0, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.07059483757370776, + "language_loss": 0.81995988, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83039922, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.09295654, + "routerloss_mlp": 0.0, + "step": 4496, + "time_per_iteration": 3.083287477493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_mlp": 1.03781724, + "diversity_loss_mlp": 0.0, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.06772870422342313, + "language_loss": 0.76696306, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77743357, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4497, + "time_per_iteration": 3.074000835418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045348, + "balance_loss_mlp": 1.03595984, + "diversity_loss_mlp": 0.0, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.07122639959522058, + "language_loss": 0.82500464, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83545816, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.09381104, + "routerloss_mlp": 0.0, + "step": 4498, + "time_per_iteration": 2.7418711185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011015, + "balance_loss_mlp": 1.00648534, + "diversity_loss_mlp": 0.0, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.011864937591166903, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80185461, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.04541016, + "routerloss_mlp": 0.0, + "step": 4499, + "time_per_iteration": 4.7632763385772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_mlp": 1.03568506, + "diversity_loss_mlp": 0.0, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.060500475018932964, + "language_loss": 0.82638729, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83683342, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4500, + "time_per_iteration": 2.673696756362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043007, + "balance_loss_mlp": 1.0338217, + "diversity_loss_mlp": 0.0, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.07115245817272867, + "language_loss": 0.80032218, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81075215, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4501, + "time_per_iteration": 2.7261881828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_mlp": 1.03443861, + "diversity_loss_mlp": 0.0, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.05583001645863395, + "language_loss": 0.88010484, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89054, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4502, + "time_per_iteration": 2.8443400859832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_mlp": 1.03399086, + "diversity_loss_mlp": 0.0, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06991854339818697, + "language_loss": 0.79483074, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80526078, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4503, + "time_per_iteration": 2.7233920097351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_mlp": 1.0366962, + "diversity_loss_mlp": 0.0, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.06089898281543335, + "language_loss": 0.82218802, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83264363, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4504, + "time_per_iteration": 2.7425873279571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050274, + "balance_loss_mlp": 1.04102361, + "diversity_loss_mlp": 0.0, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.06301593457003249, + "language_loss": 0.78539002, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79589272, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4505, + "time_per_iteration": 2.779776096343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_mlp": 1.03568339, + "diversity_loss_mlp": 0.0, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.08053258741139525, + "language_loss": 0.81561208, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82605934, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4506, + "time_per_iteration": 2.9221668243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_mlp": 1.03598642, + "diversity_loss_mlp": 0.0, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.0716656508067539, + "language_loss": 0.84894359, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85939521, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4507, + "time_per_iteration": 2.72947359085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045976, + "balance_loss_mlp": 1.03671956, + "diversity_loss_mlp": 0.0, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.06708434542706315, + "language_loss": 0.76185739, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77231717, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.09265137, + "routerloss_mlp": 0.0, + "step": 4508, + "time_per_iteration": 3.0237209796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_mlp": 1.03565264, + "diversity_loss_mlp": 0.0, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.06518598780385719, + "language_loss": 0.83932543, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84977174, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4509, + "time_per_iteration": 2.755521059036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047969, + "balance_loss_mlp": 1.03887904, + "diversity_loss_mlp": 0.0, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.0684158926680597, + "language_loss": 0.86113983, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87161952, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4510, + "time_per_iteration": 3.0163121223449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046767, + "balance_loss_mlp": 1.03768277, + "diversity_loss_mlp": 0.0, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.06806250868382878, + "language_loss": 0.80556583, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81603348, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4511, + "time_per_iteration": 2.7898309230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045577, + "balance_loss_mlp": 1.03637373, + "diversity_loss_mlp": 0.0, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.09053329692660277, + "language_loss": 0.79419863, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80465442, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4512, + "time_per_iteration": 2.579146385192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_mlp": 1.03741062, + "diversity_loss_mlp": 0.0, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.06296584652642616, + "language_loss": 0.80771571, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.81817764, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4513, + "time_per_iteration": 2.6543962955474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_mlp": 1.03607059, + "diversity_loss_mlp": 0.0, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.07075999679510799, + "language_loss": 0.81035638, + "learning_rate": 4.472626206030528e-05, + "loss": 0.82080734, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4514, + "time_per_iteration": 2.7115249633789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_mlp": 1.03727281, + "diversity_loss_mlp": 0.0, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.08852072985797838, + "language_loss": 0.84644556, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85690933, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4515, + "time_per_iteration": 3.3953351974487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_mlp": 1.03951859, + "diversity_loss_mlp": 0.0, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.09550241245969901, + "language_loss": 0.83630067, + "learning_rate": 4.446902963685862e-05, + "loss": 0.8467859, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4516, + "time_per_iteration": 2.7019460201263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046576, + "balance_loss_mlp": 1.03759933, + "diversity_loss_mlp": 0.0, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.061078878472804264, + "language_loss": 0.84983051, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.86029625, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4517, + "time_per_iteration": 2.6748125553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_mlp": 1.04121304, + "diversity_loss_mlp": 0.0, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.06941157706477712, + "language_loss": 0.86215872, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87265968, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4518, + "time_per_iteration": 2.580519914627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_mlp": 1.0403676, + "diversity_loss_mlp": 0.0, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.060481411793616664, + "language_loss": 0.79905188, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80954409, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4519, + "time_per_iteration": 2.739733934402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_mlp": 1.04021573, + "diversity_loss_mlp": 0.0, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.08287606201497805, + "language_loss": 0.79479718, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80528903, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4520, + "time_per_iteration": 2.8706867694854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050204, + "balance_loss_mlp": 1.04147816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.06861911538387308, + "language_loss": 0.7854861, + "learning_rate": 4.38290443731934e-05, + "loss": 0.7959882, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4521, + "time_per_iteration": 2.5845677852630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051004, + "balance_loss_mlp": 1.0421524, + "diversity_loss_mlp": 0.0, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.0587255823279189, + "language_loss": 0.82027864, + "learning_rate": 4.370157842584671e-05, + "loss": 0.83078861, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4522, + "time_per_iteration": 2.7062559127807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047666, + "balance_loss_mlp": 1.03883255, + "diversity_loss_mlp": 0.0, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.07380194299564537, + "language_loss": 0.80566227, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81613898, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4523, + "time_per_iteration": 3.1326324939727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050998, + "balance_loss_mlp": 1.04187274, + "diversity_loss_mlp": 0.0, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.06623832108710956, + "language_loss": 0.88391662, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89442658, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4524, + "time_per_iteration": 2.684760808944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_mlp": 1.03950179, + "diversity_loss_mlp": 0.0, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.06258298642895538, + "language_loss": 0.84498411, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.8554697, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4525, + "time_per_iteration": 2.8076937198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_mlp": 1.04153669, + "diversity_loss_mlp": 0.0, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + "grad_norm": 0.058503085061922935, + "language_loss": 0.85245442, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86295837, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4526, + "time_per_iteration": 2.938445806503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.03660226, + "diversity_loss_mlp": 0.0, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.06425490678836035, + "language_loss": 0.83926785, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84972262, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4527, + "time_per_iteration": 2.759881019592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_mlp": 1.03936505, + "diversity_loss_mlp": 0.0, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.07304239619490156, + "language_loss": 0.81745154, + "learning_rate": 4.294050463490401e-05, + "loss": 0.8279348, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4528, + "time_per_iteration": 2.6849725246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_mlp": 1.04004014, + "diversity_loss_mlp": 0.0, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.08116186300687973, + "language_loss": 0.82389712, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83438438, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4529, + "time_per_iteration": 2.721444606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_mlp": 1.03866649, + "diversity_loss_mlp": 0.0, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.0788947608454547, + "language_loss": 0.73803437, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74850899, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4530, + "time_per_iteration": 3.0360207557678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047532, + "balance_loss_mlp": 1.03866839, + "diversity_loss_mlp": 0.0, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.061803367683131466, + "language_loss": 0.86130869, + "learning_rate": 4.256236259953489e-05, + "loss": 0.87178397, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4531, + "time_per_iteration": 3.0060312747955322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_mlp": 1.04256225, + "diversity_loss_mlp": 0.0, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.08097144635360554, + "language_loss": 0.85292768, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86344463, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 4532, + "time_per_iteration": 2.5708203315734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_mlp": 1.03403783, + "diversity_loss_mlp": 0.0, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.07173781512264084, + "language_loss": 0.7855528, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79597974, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.08660889, + "routerloss_mlp": 0.0, + "step": 4533, + "time_per_iteration": 2.714898109436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100683, + "balance_loss_mlp": 1.00234771, + "diversity_loss_mlp": 0.0, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.011018751042369157, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81973636, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4534, + "time_per_iteration": 4.830231189727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046788, + "balance_loss_mlp": 1.03760934, + "diversity_loss_mlp": 0.0, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.0639859938433398, + "language_loss": 0.87151349, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88198137, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4535, + "time_per_iteration": 2.7394185066223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_mlp": 1.03511095, + "diversity_loss_mlp": 0.0, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.07410951797613952, + "language_loss": 0.80976605, + "learning_rate": 4.193567838376888e-05, + "loss": 0.8202107, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.09344482, + "routerloss_mlp": 0.0, + "step": 4536, + "time_per_iteration": 2.5781943798065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_mlp": 1.03959656, + "diversity_loss_mlp": 0.0, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.07408162868136768, + "language_loss": 0.82072723, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83121085, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4537, + "time_per_iteration": 2.6797525882720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_mlp": 1.03713083, + "diversity_loss_mlp": 0.0, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.07156355175880628, + "language_loss": 0.78797638, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79843724, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4538, + "time_per_iteration": 2.8440496921539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047687, + "balance_loss_mlp": 1.03858507, + "diversity_loss_mlp": 0.0, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.0722387407949978, + "language_loss": 0.79965913, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81013602, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4539, + "time_per_iteration": 2.721238136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_mlp": 1.04186094, + "diversity_loss_mlp": 0.0, + "epoch": 0.8734128510965756, + "flos": 561883972608.0, + "grad_norm": 0.12124336335781533, + "language_loss": 0.84041327, + "learning_rate": 4.143753177230242e-05, + "loss": 0.8509202, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4540, + "time_per_iteration": 2.6914098262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045135, + "balance_loss_mlp": 1.03622985, + "diversity_loss_mlp": 0.0, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.07799885017860995, + "language_loss": 0.79752243, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80797374, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4541, + "time_per_iteration": 2.93182110786438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_mlp": 1.03960705, + "diversity_loss_mlp": 0.0, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.06451256022818536, + "language_loss": 0.81514108, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82562894, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.09185791, + "routerloss_mlp": 0.0, + "step": 4542, + "time_per_iteration": 2.8326876163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00778397, + "balance_loss_mlp": 1.31047845, + "diversity_loss_mlp": 0.22450379, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.03126791623306444, + "language_loss": 0.81873107, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82651508, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01090602, + "step": 4543, + "time_per_iteration": 2.9323105812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048028, + "balance_loss_mlp": 1.03904009, + "diversity_loss_mlp": 0.0, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.09261999312040192, + "language_loss": 0.76578218, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77626246, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4544, + "time_per_iteration": 2.8980069160461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104863, + "balance_loss_mlp": 1.03955245, + "diversity_loss_mlp": 0.0, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.06860768160110936, + "language_loss": 0.83654785, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84703422, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4545, + "time_per_iteration": 2.7457897663116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049582, + "balance_loss_mlp": 1.04058218, + "diversity_loss_mlp": 0.0, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.06696244649326027, + "language_loss": 0.82145166, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83194745, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4546, + "time_per_iteration": 2.5956528186798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050136, + "balance_loss_mlp": 1.04104638, + "diversity_loss_mlp": 0.0, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.06814063729509118, + "language_loss": 0.83736241, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84786379, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4547, + "time_per_iteration": 2.6598734855651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104913, + "balance_loss_mlp": 1.04005837, + "diversity_loss_mlp": 0.0, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.07262523755606552, + "language_loss": 0.80276871, + "learning_rate": 4.044978704935853e-05, + "loss": 0.81325996, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4548, + "time_per_iteration": 3.042619466781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054339, + "balance_loss_mlp": 1.04545808, + "diversity_loss_mlp": 0.0, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.0643557055974673, + "language_loss": 0.79893917, + "learning_rate": 4.032712131660027e-05, + "loss": 0.80948257, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4549, + "time_per_iteration": 2.8232662677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_mlp": 1.03677678, + "diversity_loss_mlp": 0.0, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.06974853076229501, + "language_loss": 0.78530467, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79576278, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4550, + "time_per_iteration": 2.7248096466064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046831, + "balance_loss_mlp": 1.03792024, + "diversity_loss_mlp": 0.0, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.08026438876668639, + "language_loss": 0.81858146, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82904983, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4551, + "time_per_iteration": 2.563875436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_mlp": 1.03774762, + "diversity_loss_mlp": 0.0, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.27955745224323525, + "language_loss": 0.81637728, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82684195, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4552, + "time_per_iteration": 2.810784339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_mlp": 1.03973484, + "diversity_loss_mlp": 0.0, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.0711083365968444, + "language_loss": 0.78033483, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.79082167, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4553, + "time_per_iteration": 3.2460765838623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_mlp": 1.03957188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8761061946902655, + "flos": 802764338688.0, + "grad_norm": 0.05712124953956382, + "language_loss": 0.77816379, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78864872, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4554, + "time_per_iteration": 3.0767805576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_mlp": 1.03772795, + "diversity_loss_mlp": 0.0, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.0721600968568646, + "language_loss": 0.74639142, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75685859, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4555, + "time_per_iteration": 2.9832050800323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052245, + "balance_loss_mlp": 1.04343569, + "diversity_loss_mlp": 0.0, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.06902673277726463, + "language_loss": 0.80373311, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.8142556, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4556, + "time_per_iteration": 2.8642075061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_mlp": 1.03882241, + "diversity_loss_mlp": 0.0, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.06429651244751071, + "language_loss": 0.80069965, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81117713, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4557, + "time_per_iteration": 2.6734185218811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_mlp": 1.03912759, + "diversity_loss_mlp": 0.0, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.06573901979402896, + "language_loss": 0.78168076, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79216218, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4558, + "time_per_iteration": 2.5166428089141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049801, + "balance_loss_mlp": 1.04095614, + "diversity_loss_mlp": 0.0, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.0842466180792191, + "language_loss": 0.8216058, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.83210379, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4559, + "time_per_iteration": 2.666722536087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050514, + "balance_loss_mlp": 1.04134107, + "diversity_loss_mlp": 0.0, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.07334962326293068, + "language_loss": 0.80860007, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81910527, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4560, + "time_per_iteration": 2.627713441848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_mlp": 1.04125929, + "diversity_loss_mlp": 0.0, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.07694067808462435, + "language_loss": 0.8510192, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86152148, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4561, + "time_per_iteration": 2.5129141807556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_mlp": 1.04076576, + "diversity_loss_mlp": 0.0, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.05712308761867227, + "language_loss": 0.83274788, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84324539, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4562, + "time_per_iteration": 2.6301164627075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775546, + "balance_loss_mlp": 1.3038888, + "diversity_loss_mlp": 0.22576925, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.034853936620068894, + "language_loss": 0.7813766, + "learning_rate": 3.862856098834189e-05, + "loss": 0.78913212, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01071687, + "step": 4563, + "time_per_iteration": 2.876042604446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055367, + "balance_loss_mlp": 1.04642081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.06747212929306415, + "language_loss": 0.80067873, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81123239, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4564, + "time_per_iteration": 2.8073532581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052003, + "balance_loss_mlp": 1.04328895, + "diversity_loss_mlp": 0.0, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.06003904599639906, + "language_loss": 0.77326131, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78378129, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4565, + "time_per_iteration": 2.6049962043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050973, + "balance_loss_mlp": 1.04202604, + "diversity_loss_mlp": 0.0, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.064833498730125, + "language_loss": 0.70079195, + "learning_rate": 3.826914695965766e-05, + "loss": 0.71130168, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4566, + "time_per_iteration": 3.1731789112091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00786853, + "balance_loss_mlp": 1.32932496, + "diversity_loss_mlp": 0.22292963, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.0397840730750478, + "language_loss": 0.76011282, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76798129, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072608, + "step": 4567, + "time_per_iteration": 2.697258472442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050311, + "balance_loss_mlp": 1.04135227, + "diversity_loss_mlp": 0.0, + "epoch": 0.8787995382839554, + "flos": 603448081920.0, + "grad_norm": 0.06722529563230402, + "language_loss": 0.77491319, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78541636, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4568, + "time_per_iteration": 2.840650796890259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050171, + "balance_loss_mlp": 1.04145098, + "diversity_loss_mlp": 0.0, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.05883368445240149, + "language_loss": 0.8492918, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85979354, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4569, + "time_per_iteration": 2.6557326316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_mlp": 1.04278803, + "diversity_loss_mlp": 0.0, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.07943052402500107, + "language_loss": 0.8255586, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83607388, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4570, + "time_per_iteration": 2.627609968185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053016, + "balance_loss_mlp": 1.04396188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.06059091910308417, + "language_loss": 0.79351205, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80404216, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4571, + "time_per_iteration": 3.35367751121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_mlp": 1.0433991, + "diversity_loss_mlp": 0.0, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06539899330048332, + "language_loss": 0.80981296, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82033718, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4572, + "time_per_iteration": 2.880627155303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_mlp": 1.0410192, + "diversity_loss_mlp": 0.0, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07570874184627417, + "language_loss": 0.88668913, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89718843, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4573, + "time_per_iteration": 2.563573122024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_mlp": 1.04257524, + "diversity_loss_mlp": 0.0, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.06673280620392491, + "language_loss": 0.84119153, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.8517074, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4574, + "time_per_iteration": 2.6805808544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052755, + "balance_loss_mlp": 1.04388571, + "diversity_loss_mlp": 0.0, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.07043061387858378, + "language_loss": 0.84413314, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85466063, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4575, + "time_per_iteration": 3.049510955810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_mlp": 1.04210222, + "diversity_loss_mlp": 0.0, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06156041987406192, + "language_loss": 0.84676832, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85728043, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4576, + "time_per_iteration": 2.931907892227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050983, + "balance_loss_mlp": 1.04205978, + "diversity_loss_mlp": 0.0, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.05826624297126263, + "language_loss": 0.81173784, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82224762, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4577, + "time_per_iteration": 2.7370834350585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051952, + "balance_loss_mlp": 1.04316616, + "diversity_loss_mlp": 0.0, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.06645498049207266, + "language_loss": 0.81695998, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82747948, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4578, + "time_per_iteration": 2.7928130626678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105216, + "balance_loss_mlp": 1.04327333, + "diversity_loss_mlp": 0.0, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.06357300740797822, + "language_loss": 0.79227793, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80279958, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4579, + "time_per_iteration": 2.7231593132019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050745, + "balance_loss_mlp": 1.04194164, + "diversity_loss_mlp": 0.0, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.059350650415536, + "language_loss": 0.76098466, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77149218, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4580, + "time_per_iteration": 2.683220624923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048772, + "balance_loss_mlp": 1.03990269, + "diversity_loss_mlp": 0.0, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.06771926957891432, + "language_loss": 0.81324798, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82373571, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4581, + "time_per_iteration": 2.6779592037200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_mlp": 1.04133832, + "diversity_loss_mlp": 0.0, + "epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.07585053291634766, + "language_loss": 0.79299724, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80349755, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4582, + "time_per_iteration": 2.831101894378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_mlp": 1.04368544, + "diversity_loss_mlp": 0.0, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.06530916783888785, + "language_loss": 0.85757875, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86810547, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4583, + "time_per_iteration": 2.7231874465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050068, + "balance_loss_mlp": 1.04128242, + "diversity_loss_mlp": 0.0, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.07680446741638405, + "language_loss": 0.82252479, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83302546, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4584, + "time_per_iteration": 2.6065118312835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00796186, + "balance_loss_mlp": 1.34451175, + "diversity_loss_mlp": 0.22621799, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.03516245413492739, + "language_loss": 0.73908472, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74704659, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0108207, + "step": 4585, + "time_per_iteration": 3.3458354473114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048695, + "balance_loss_mlp": 1.039891, + "diversity_loss_mlp": 0.0, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.06564674034294884, + "language_loss": 0.80001426, + "learning_rate": 3.591434321288345e-05, + "loss": 0.81050122, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4586, + "time_per_iteration": 2.72759747505188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_mlp": 1.04060817, + "diversity_loss_mlp": 0.0, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.07346558638928435, + "language_loss": 0.81996882, + "learning_rate": 3.579849183630485e-05, + "loss": 0.83046365, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4587, + "time_per_iteration": 2.808663845062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051315, + "balance_loss_mlp": 1.0421896, + "diversity_loss_mlp": 0.0, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.06304354104337369, + "language_loss": 0.78938949, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79990268, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 4588, + "time_per_iteration": 2.573918581008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047855, + "balance_loss_mlp": 1.03888416, + "diversity_loss_mlp": 0.0, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.061374871286848334, + "language_loss": 0.83903325, + "learning_rate": 3.556732978508048e-05, + "loss": 0.8495118, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4589, + "time_per_iteration": 2.6800525188446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049011, + "balance_loss_mlp": 1.04007053, + "diversity_loss_mlp": 0.0, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.06744146282588834, + "language_loss": 0.81342435, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82391441, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4590, + "time_per_iteration": 2.953735589981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052142, + "balance_loss_mlp": 1.04338574, + "diversity_loss_mlp": 0.0, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.07827681611400703, + "language_loss": 0.81570184, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82622325, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4591, + "time_per_iteration": 2.611823081970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_mlp": 1.04045248, + "diversity_loss_mlp": 0.0, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07488922713809969, + "language_loss": 0.82166886, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83216357, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4592, + "time_per_iteration": 2.820740222930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049441, + "balance_loss_mlp": 1.04061973, + "diversity_loss_mlp": 0.0, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.06826234415728213, + "language_loss": 0.82207388, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83256829, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4593, + "time_per_iteration": 2.7582898139953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048958, + "balance_loss_mlp": 1.04009509, + "diversity_loss_mlp": 0.0, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.07322628079560306, + "language_loss": 0.80310255, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81359208, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4594, + "time_per_iteration": 2.7062149047851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051053, + "balance_loss_mlp": 1.04161763, + "diversity_loss_mlp": 0.0, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.08697939284189399, + "language_loss": 0.77308345, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78359401, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.09423828, + "routerloss_mlp": 0.0, + "step": 4595, + "time_per_iteration": 2.6008739471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047041, + "balance_loss_mlp": 1.03805816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.07630739769725799, + "language_loss": 0.79033625, + "learning_rate": 3.47639446766777e-05, + "loss": 0.80080664, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4596, + "time_per_iteration": 2.8426897525787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_mlp": 1.040079, + "diversity_loss_mlp": 0.0, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.06236969459816259, + "language_loss": 0.82549202, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.83598149, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4597, + "time_per_iteration": 3.0126264095306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050555, + "balance_loss_mlp": 1.0417217, + "diversity_loss_mlp": 0.0, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.057498871629657215, + "language_loss": 0.82855976, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83906525, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4598, + "time_per_iteration": 2.9096622467041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779413, + "balance_loss_mlp": 1.31441939, + "diversity_loss_mlp": 0.22293654, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.031937649468038294, + "language_loss": 0.80943024, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81722438, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073514, + "step": 4599, + "time_per_iteration": 2.752638339996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049498, + "balance_loss_mlp": 1.04064703, + "diversity_loss_mlp": 0.0, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.06795094778934727, + "language_loss": 0.84458822, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85508323, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4600, + "time_per_iteration": 2.663498878479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779393, + "balance_loss_mlp": 1.31195164, + "diversity_loss_mlp": 0.22577716, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.03181593301262544, + "language_loss": 0.83776021, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84555423, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01052869, + "step": 4601, + "time_per_iteration": 2.7995564937591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046945, + "balance_loss_mlp": 1.0379926, + "diversity_loss_mlp": 0.0, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.06356049403382279, + "language_loss": 0.80725026, + "learning_rate": 3.408237248940088e-05, + "loss": 0.8177197, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4602, + "time_per_iteration": 2.6017932891845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_mlp": 1.03828001, + "diversity_loss_mlp": 0.0, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.07035000464547823, + "language_loss": 0.77883828, + "learning_rate": 3.396940996663683e-05, + "loss": 0.78931201, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4603, + "time_per_iteration": 2.9521942138671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046951, + "balance_loss_mlp": 1.03792644, + "diversity_loss_mlp": 0.0, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06898692389267871, + "language_loss": 0.78990823, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80037773, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4604, + "time_per_iteration": 2.5854694843292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_mlp": 1.03895068, + "diversity_loss_mlp": 0.0, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.06638743776056398, + "language_loss": 0.81713545, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82761252, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4605, + "time_per_iteration": 2.692868232727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_mlp": 1.03658962, + "diversity_loss_mlp": 0.0, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.06624513803881459, + "language_loss": 0.85526776, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86572611, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.09240723, + "routerloss_mlp": 0.0, + "step": 4606, + "time_per_iteration": 2.6592142581939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790369, + "balance_loss_mlp": 1.33229494, + "diversity_loss_mlp": 0.22699621, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.03136786172758775, + "language_loss": 0.79641789, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80432159, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072356, + "step": 4607, + "time_per_iteration": 2.7557034492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048991, + "balance_loss_mlp": 1.03997266, + "diversity_loss_mlp": 0.0, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.053068589539523224, + "language_loss": 0.83634484, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84683472, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4608, + "time_per_iteration": 2.970646381378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013538, + "balance_loss_mlp": 1.00912714, + "diversity_loss_mlp": 0.0, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.013952158084226052, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79844493, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4609, + "time_per_iteration": 4.800167798995972 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_mlp": 1.03707361, + "diversity_loss_mlp": 0.0, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.06983974762090492, + "language_loss": 0.82014269, + "learning_rate": 3.3183740769755e-05, + "loss": 0.83060396, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4610, + "time_per_iteration": 3.0428099632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013271, + "balance_loss_mlp": 1.00885999, + "diversity_loss_mlp": 0.0, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.013954976330346456, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77924109, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4611, + "time_per_iteration": 4.960276126861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_mlp": 1.04021323, + "diversity_loss_mlp": 0.0, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.06747784662244205, + "language_loss": 0.75143421, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76192367, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4612, + "time_per_iteration": 2.8096370697021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046218, + "balance_loss_mlp": 1.03731275, + "diversity_loss_mlp": 0.0, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.081523690910391, + "language_loss": 0.83038783, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84084994, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4613, + "time_per_iteration": 2.6296303272247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_mlp": 1.0389961, + "diversity_loss_mlp": 0.0, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.07384350898299535, + "language_loss": 0.79394948, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80442905, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4614, + "time_per_iteration": 3.9052226543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045945, + "balance_loss_mlp": 1.0370816, + "diversity_loss_mlp": 0.0, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.06075632435028376, + "language_loss": 0.84765017, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85810959, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4615, + "time_per_iteration": 2.784306764602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049902, + "balance_loss_mlp": 1.04100347, + "diversity_loss_mlp": 0.0, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.07661340087165963, + "language_loss": 0.81347793, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82397699, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4616, + "time_per_iteration": 2.585916042327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779874, + "balance_loss_mlp": 1.31519485, + "diversity_loss_mlp": 0.22310758, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.03294259540614503, + "language_loss": 0.79988885, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80768752, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01072259, + "step": 4617, + "time_per_iteration": 2.658268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_mlp": 1.03512335, + "diversity_loss_mlp": 0.0, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.08219678758811591, + "language_loss": 0.83779407, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84823501, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4618, + "time_per_iteration": 2.6499626636505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013119, + "balance_loss_mlp": 1.00873196, + "diversity_loss_mlp": 0.0, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 0.01269771212796008, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79525316, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.04394531, + "routerloss_mlp": 0.0, + "step": 4619, + "time_per_iteration": 5.039214134216309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048007, + "balance_loss_mlp": 1.03929269, + "diversity_loss_mlp": 0.0, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.06229683334708209, + "language_loss": 0.82604653, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83652663, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4620, + "time_per_iteration": 2.987938404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044784, + "balance_loss_mlp": 1.03616548, + "diversity_loss_mlp": 0.0, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.0772642935579886, + "language_loss": 0.8405602, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.851008, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 4621, + "time_per_iteration": 3.1390573978424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050592, + "balance_loss_mlp": 1.04172254, + "diversity_loss_mlp": 0.0, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.06838136238403997, + "language_loss": 0.81778359, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82828951, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4622, + "time_per_iteration": 2.799467086791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047387, + "balance_loss_mlp": 1.03847051, + "diversity_loss_mlp": 0.0, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.0659043400927782, + "language_loss": 0.82619703, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83667088, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4623, + "time_per_iteration": 2.7340970039367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046712, + "balance_loss_mlp": 1.0377115, + "diversity_loss_mlp": 0.0, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.06558378954602251, + "language_loss": 0.81849378, + "learning_rate": 3.163905853111054e-05, + "loss": 0.8289609, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4624, + "time_per_iteration": 2.6568024158477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_mlp": 1.03908443, + "diversity_loss_mlp": 0.0, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.060975907763050036, + "language_loss": 0.81057096, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82105064, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4625, + "time_per_iteration": 2.7340495586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044582, + "balance_loss_mlp": 1.03537273, + "diversity_loss_mlp": 0.0, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.07485889575749058, + "language_loss": 0.770868, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78131384, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.09210205, + "routerloss_mlp": 0.0, + "step": 4626, + "time_per_iteration": 3.187793016433716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_mlp": 1.04202616, + "diversity_loss_mlp": 0.0, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.08455877289506715, + "language_loss": 0.8016057, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81211603, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4627, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104463, + "balance_loss_mlp": 1.03559375, + "diversity_loss_mlp": 0.0, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.06293120132110656, + "language_loss": 0.80719471, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81764102, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4628, + "time_per_iteration": 2.9961817264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_mlp": 1.03616869, + "diversity_loss_mlp": 0.0, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.08203467156217556, + "language_loss": 0.81727576, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82772422, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4629, + "time_per_iteration": 2.679408073425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_mlp": 1.03355646, + "diversity_loss_mlp": 0.0, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.06898009343071365, + "language_loss": 0.79810011, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80852401, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4630, + "time_per_iteration": 2.83233380317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_mlp": 1.03328514, + "diversity_loss_mlp": 0.0, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.0715777029832187, + "language_loss": 0.7953496, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80576855, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4631, + "time_per_iteration": 3.12410569190979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077771, + "balance_loss_mlp": 1.31088805, + "diversity_loss_mlp": 0.22250512, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.032192261312759214, + "language_loss": 0.84286821, + "learning_rate": 3.077237681615208e-05, + "loss": 0.8506453, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01101306, + "step": 4632, + "time_per_iteration": 2.703425884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_mlp": 1.04004884, + "diversity_loss_mlp": 0.0, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 0.08188608007058847, + "language_loss": 0.84165525, + "learning_rate": 3.066486092807874e-05, + "loss": 0.85214841, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4633, + "time_per_iteration": 2.712557554244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_mlp": 1.03861618, + "diversity_loss_mlp": 0.0, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.06060123366569166, + "language_loss": 0.85206622, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86254251, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4634, + "time_per_iteration": 2.630039691925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_mlp": 1.03316331, + "diversity_loss_mlp": 0.0, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.06527746139553993, + "language_loss": 0.8135035, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82392418, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4635, + "time_per_iteration": 2.5558903217315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047554, + "balance_loss_mlp": 1.03875005, + "diversity_loss_mlp": 0.0, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.06346729793174329, + "language_loss": 0.78307879, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79355425, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4636, + "time_per_iteration": 2.7006030082702637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_mlp": 1.03714168, + "diversity_loss_mlp": 0.0, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06783278448064689, + "language_loss": 0.8109082, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82136714, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4637, + "time_per_iteration": 2.6627137660980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043668, + "balance_loss_mlp": 1.03518057, + "diversity_loss_mlp": 0.0, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.06701291241567459, + "language_loss": 0.84168345, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85212016, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.08496094, + "routerloss_mlp": 0.0, + "step": 4638, + "time_per_iteration": 2.7190563678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_mlp": 1.04025865, + "diversity_loss_mlp": 0.0, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.06480897369874776, + "language_loss": 0.79137188, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80186164, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4639, + "time_per_iteration": 2.7548539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046259, + "balance_loss_mlp": 1.03746128, + "diversity_loss_mlp": 0.0, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.06545758779491198, + "language_loss": 0.81798422, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82844687, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4640, + "time_per_iteration": 2.7450599670410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_mlp": 1.03710771, + "diversity_loss_mlp": 0.0, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.07321859189533414, + "language_loss": 0.80895549, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81941569, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4641, + "time_per_iteration": 2.5623698234558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003551, + "balance_loss_mlp": 0.99911606, + "diversity_loss_mlp": 0.0, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.005611533508350328, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81334412, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4642, + "time_per_iteration": 4.691712379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_mlp": 1.03812027, + "diversity_loss_mlp": 0.0, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.0756626581840296, + "language_loss": 0.8056438, + "learning_rate": 2.95997305629786e-05, + "loss": 0.8161152, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4643, + "time_per_iteration": 2.774066925048828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048848, + "balance_loss_mlp": 1.03975809, + "diversity_loss_mlp": 0.0, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.07062905944842346, + "language_loss": 0.84894288, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85943139, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4644, + "time_per_iteration": 2.6488940715789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_mlp": 1.03935552, + "diversity_loss_mlp": 0.0, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.0836667751857819, + "language_loss": 0.78037202, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79085219, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4645, + "time_per_iteration": 2.583796977996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_mlp": 1.04036856, + "diversity_loss_mlp": 0.0, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.05897365940384636, + "language_loss": 0.807109, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81759953, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4646, + "time_per_iteration": 3.2107162475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_mlp": 1.03690004, + "diversity_loss_mlp": 0.0, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 0.06566650575637094, + "language_loss": 0.8383972, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.8488546, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4647, + "time_per_iteration": 2.742075204849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_mlp": 1.04111314, + "diversity_loss_mlp": 0.0, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.07362813813067959, + "language_loss": 0.81445944, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82496303, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.09259033, + "routerloss_mlp": 0.0, + "step": 4648, + "time_per_iteration": 2.664386510848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_mlp": 1.03510404, + "diversity_loss_mlp": 0.0, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.06107370863093702, + "language_loss": 0.80719924, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.81763935, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4649, + "time_per_iteration": 2.9920804500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047105, + "balance_loss_mlp": 1.03825331, + "diversity_loss_mlp": 0.0, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.06165388839592064, + "language_loss": 0.84649253, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85696357, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4650, + "time_per_iteration": 2.6212713718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_mlp": 1.03758526, + "diversity_loss_mlp": 0.0, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.06579934808698863, + "language_loss": 0.83054405, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84100872, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4651, + "time_per_iteration": 2.671393394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_mlp": 1.03632951, + "diversity_loss_mlp": 0.0, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.06478595695479929, + "language_loss": 0.81956565, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.83001965, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4652, + "time_per_iteration": 2.849560499191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_mlp": 1.03662467, + "diversity_loss_mlp": 0.0, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.06805126112229812, + "language_loss": 0.7762472, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78670454, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4653, + "time_per_iteration": 2.9823384284973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.03572643, + "diversity_loss_mlp": 0.0, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.06521391394645211, + "language_loss": 0.86080307, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87125206, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.09179688, + "routerloss_mlp": 0.0, + "step": 4654, + "time_per_iteration": 2.7805397510528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048835, + "balance_loss_mlp": 1.03972173, + "diversity_loss_mlp": 0.0, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.0849204409565989, + "language_loss": 0.83320463, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84369302, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4655, + "time_per_iteration": 2.876401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780551, + "balance_loss_mlp": 1.31460428, + "diversity_loss_mlp": 0.22509943, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.034355787829583595, + "language_loss": 0.77789617, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78570163, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0106987, + "step": 4656, + "time_per_iteration": 3.0823395252227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_mlp": 1.03641081, + "diversity_loss_mlp": 0.0, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.0676440423058397, + "language_loss": 0.77287573, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78333104, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4657, + "time_per_iteration": 2.64528751373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048024, + "balance_loss_mlp": 1.03879762, + "diversity_loss_mlp": 0.0, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.0693704945431175, + "language_loss": 0.77242142, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78290164, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4658, + "time_per_iteration": 2.6108851432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_mlp": 1.03874731, + "diversity_loss_mlp": 0.0, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.0647769416450041, + "language_loss": 0.83169466, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84217411, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.09191895, + "routerloss_mlp": 0.0, + "step": 4659, + "time_per_iteration": 2.605060338973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791335, + "balance_loss_mlp": 1.33468997, + "diversity_loss_mlp": 0.22667646, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.035487365759697125, + "language_loss": 0.82103157, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82894492, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01065169, + "step": 4660, + "time_per_iteration": 2.7054848670959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_mlp": 1.0363133, + "diversity_loss_mlp": 0.0, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 0.08335321412533339, + "language_loss": 0.81629348, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82674694, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4661, + "time_per_iteration": 2.6878600120544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_mlp": 1.04299188, + "diversity_loss_mlp": 0.0, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.08023362288618259, + "language_loss": 0.84117174, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85168993, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4662, + "time_per_iteration": 2.9065072536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783028, + "balance_loss_mlp": 1.31757593, + "diversity_loss_mlp": 0.22707665, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.028939334122514253, + "language_loss": 0.84291148, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.85074168, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01070135, + "step": 4663, + "time_per_iteration": 2.978598117828369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_mlp": 1.03588283, + "diversity_loss_mlp": 0.0, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.09868574536780622, + "language_loss": 0.75424099, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76468909, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4664, + "time_per_iteration": 2.7175958156585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104879, + "balance_loss_mlp": 1.03972983, + "diversity_loss_mlp": 0.0, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.07019842465420709, + "language_loss": 0.83128035, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84176832, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4665, + "time_per_iteration": 2.7153587341308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_mlp": 1.03744328, + "diversity_loss_mlp": 0.0, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.06031238876791543, + "language_loss": 0.87254226, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88300675, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4666, + "time_per_iteration": 2.6804378032684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_mlp": 1.03897464, + "diversity_loss_mlp": 0.0, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.05793843844833838, + "language_loss": 0.82573009, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83620781, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4667, + "time_per_iteration": 2.6166820526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_mlp": 1.03860664, + "diversity_loss_mlp": 0.0, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.057031250426829085, + "language_loss": 0.82203746, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.8325128, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4668, + "time_per_iteration": 2.7726669311523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_mlp": 1.04209065, + "diversity_loss_mlp": 0.0, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.07157029094935193, + "language_loss": 0.82771599, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83822584, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4669, + "time_per_iteration": 2.934701681137085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050412, + "balance_loss_mlp": 1.04158425, + "diversity_loss_mlp": 0.0, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.07594625881413491, + "language_loss": 0.77720773, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78771186, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4670, + "time_per_iteration": 3.232701539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050559, + "balance_loss_mlp": 1.04171383, + "diversity_loss_mlp": 0.0, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.07298208517048191, + "language_loss": 0.75987267, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77037835, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4671, + "time_per_iteration": 3.183443784713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_mlp": 1.03399336, + "diversity_loss_mlp": 0.0, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.0671693421720064, + "language_loss": 0.76635265, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77678287, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4672, + "time_per_iteration": 2.657771587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047693, + "balance_loss_mlp": 1.03888941, + "diversity_loss_mlp": 0.0, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.07004510948289375, + "language_loss": 0.86707628, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87755322, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4673, + "time_per_iteration": 2.535236120223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_mlp": 1.03844619, + "diversity_loss_mlp": 0.0, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.07892824616979383, + "language_loss": 0.75965667, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.77013206, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4674, + "time_per_iteration": 2.6591787338256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048875, + "balance_loss_mlp": 1.03979182, + "diversity_loss_mlp": 0.0, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 0.06983733159730086, + "language_loss": 0.80178928, + "learning_rate": 2.631423662948984e-05, + "loss": 0.81227803, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 4675, + "time_per_iteration": 2.5485310554504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048014, + "balance_loss_mlp": 1.03897238, + "diversity_loss_mlp": 0.0, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.07663293464144452, + "language_loss": 0.82886845, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83934855, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4676, + "time_per_iteration": 2.712852954864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_mlp": 1.03895569, + "diversity_loss_mlp": 0.0, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.063501986784752, + "language_loss": 0.8503803, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.86085933, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4677, + "time_per_iteration": 2.700191020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_mlp": 1.04097605, + "diversity_loss_mlp": 0.0, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.06651601339856017, + "language_loss": 0.80581087, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81630689, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 4678, + "time_per_iteration": 2.815133571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004703, + "balance_loss_mlp": 1.00024414, + "diversity_loss_mlp": 0.0, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.0032341066943480366, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86788726, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4679, + "time_per_iteration": 4.805148124694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051809, + "balance_loss_mlp": 1.0428679, + "diversity_loss_mlp": 0.0, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.07566932247626351, + "language_loss": 0.79916567, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.8096838, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4680, + "time_per_iteration": 2.844775915145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_mlp": 1.03798509, + "diversity_loss_mlp": 0.0, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.06791957432772232, + "language_loss": 0.78502154, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.7954905, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4681, + "time_per_iteration": 2.6303482055664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_mlp": 1.03633404, + "diversity_loss_mlp": 0.0, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.08260546999078933, + "language_loss": 0.86167276, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.872123, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4682, + "time_per_iteration": 2.5562498569488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104895, + "balance_loss_mlp": 1.04019439, + "diversity_loss_mlp": 0.0, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.07052497776440367, + "language_loss": 0.78726637, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79775584, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4683, + "time_per_iteration": 2.8474693298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_mlp": 1.03673279, + "diversity_loss_mlp": 0.0, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.058284794620577896, + "language_loss": 0.84781289, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.85826856, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4684, + "time_per_iteration": 2.6325201988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045889, + "balance_loss_mlp": 1.03712106, + "diversity_loss_mlp": 0.0, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.07314098955075891, + "language_loss": 0.82745099, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83790988, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4685, + "time_per_iteration": 2.7466516494750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104569, + "balance_loss_mlp": 1.03673732, + "diversity_loss_mlp": 0.0, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.06299423790772288, + "language_loss": 0.81375784, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82421476, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4686, + "time_per_iteration": 2.8934953212738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048153, + "balance_loss_mlp": 1.03924799, + "diversity_loss_mlp": 0.0, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.07273312761869775, + "language_loss": 0.81357133, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82405281, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4687, + "time_per_iteration": 2.7839083671569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046931, + "balance_loss_mlp": 1.03816867, + "diversity_loss_mlp": 0.0, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.05747241213566195, + "language_loss": 0.86223972, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87270904, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4688, + "time_per_iteration": 2.8053431510925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_mlp": 1.03985882, + "diversity_loss_mlp": 0.0, + "epoch": 0.9020777222008465, + "flos": 523284820992.0, + "grad_norm": 0.06831532416346216, + "language_loss": 0.77670169, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78718954, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4689, + "time_per_iteration": 2.6122989654541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_mlp": 1.03666687, + "diversity_loss_mlp": 0.0, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.05580417916624313, + "language_loss": 0.81750822, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82796389, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4690, + "time_per_iteration": 2.8226675987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049641, + "balance_loss_mlp": 1.04086757, + "diversity_loss_mlp": 0.0, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.07066245461166361, + "language_loss": 0.84397352, + "learning_rate": 2.474202664305253e-05, + "loss": 0.8544699, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4691, + "time_per_iteration": 2.608060359954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046874, + "balance_loss_mlp": 1.03758168, + "diversity_loss_mlp": 0.0, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.06466025971704324, + "language_loss": 0.86426198, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87473077, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 4692, + "time_per_iteration": 2.63151216506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.0386498, + "diversity_loss_mlp": 0.0, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.06521986088761798, + "language_loss": 0.73844278, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74891818, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4693, + "time_per_iteration": 2.833467483520508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048441, + "balance_loss_mlp": 1.0395714, + "diversity_loss_mlp": 0.0, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.07181614420601379, + "language_loss": 0.82029641, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.8307808, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4694, + "time_per_iteration": 2.6215834617614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050504, + "balance_loss_mlp": 1.04152727, + "diversity_loss_mlp": 0.0, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.07933043955400586, + "language_loss": 0.8251496, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83565462, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4695, + "time_per_iteration": 2.9662675857543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045657, + "balance_loss_mlp": 1.03683591, + "diversity_loss_mlp": 0.0, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.08647194091584645, + "language_loss": 0.76889336, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77934992, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4696, + "time_per_iteration": 2.6831114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_mlp": 1.03765512, + "diversity_loss_mlp": 0.0, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.06589427726191109, + "language_loss": 0.82852316, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83898699, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4697, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046118, + "balance_loss_mlp": 1.03745151, + "diversity_loss_mlp": 0.0, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.07072654359751072, + "language_loss": 0.79079431, + "learning_rate": 2.406902878347017e-05, + "loss": 0.80125546, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 4698, + "time_per_iteration": 2.6087543964385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049163, + "balance_loss_mlp": 1.03998375, + "diversity_loss_mlp": 0.0, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.08844604656187115, + "language_loss": 0.81696689, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8274585, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.09173584, + "routerloss_mlp": 0.0, + "step": 4699, + "time_per_iteration": 2.6180419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_mlp": 1.03545141, + "diversity_loss_mlp": 0.0, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.06789594949929362, + "language_loss": 0.80433279, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81477618, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4700, + "time_per_iteration": 2.8047759532928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_mlp": 1.03756499, + "diversity_loss_mlp": 0.0, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.07594330446268198, + "language_loss": 0.77877766, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78924191, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4701, + "time_per_iteration": 2.5752878189086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_mlp": 0.99879241, + "diversity_loss_mlp": 0.0, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.003648556595750329, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73933041, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4702, + "time_per_iteration": 4.9735963344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_mlp": 1.04117787, + "diversity_loss_mlp": 0.0, + "epoch": 0.9047710657945364, + "flos": 585841144320.0, + "grad_norm": 0.08131986828145982, + "language_loss": 0.8338269, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.84432721, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4703, + "time_per_iteration": 2.736764430999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.0364728, + "diversity_loss_mlp": 0.0, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.085064980666539, + "language_loss": 0.79620826, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80666363, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 4704, + "time_per_iteration": 2.7620725631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010488, + "balance_loss_mlp": 1.04021692, + "diversity_loss_mlp": 0.0, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.08171845507100765, + "language_loss": 0.74530506, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75579304, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.0859375, + "routerloss_mlp": 0.0, + "step": 4705, + "time_per_iteration": 2.6691336631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048986, + "balance_loss_mlp": 1.0402658, + "diversity_loss_mlp": 0.0, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.08031830917867225, + "language_loss": 0.79134667, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80183655, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4706, + "time_per_iteration": 2.657421350479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049352, + "balance_loss_mlp": 1.04074478, + "diversity_loss_mlp": 0.0, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.07852771434357471, + "language_loss": 0.81530303, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82579654, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4707, + "time_per_iteration": 2.6042165756225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_mlp": 1.03612292, + "diversity_loss_mlp": 0.0, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.052073742250211955, + "language_loss": 0.85184813, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.86229986, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4708, + "time_per_iteration": 3.205712080001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048052, + "balance_loss_mlp": 1.03921902, + "diversity_loss_mlp": 0.0, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.07208392805658173, + "language_loss": 0.83473063, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.84521115, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4709, + "time_per_iteration": 3.15082049369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046721, + "balance_loss_mlp": 1.03755391, + "diversity_loss_mlp": 0.0, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.09897458123618827, + "language_loss": 0.77498788, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78545511, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4710, + "time_per_iteration": 2.856567144393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_mlp": 1.03875113, + "diversity_loss_mlp": 0.0, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.06579655928741501, + "language_loss": 0.82653207, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83700836, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4711, + "time_per_iteration": 2.8177871704101562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_mlp": 1.03596139, + "diversity_loss_mlp": 0.0, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.06583523405134029, + "language_loss": 0.78812146, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.79856777, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4712, + "time_per_iteration": 2.880993604660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010452, + "balance_loss_mlp": 1.03643262, + "diversity_loss_mlp": 0.0, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.07415444506941751, + "language_loss": 0.80136561, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81181759, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4713, + "time_per_iteration": 2.627692937850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_mlp": 1.03697634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.06943776230353088, + "language_loss": 0.84932685, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85978746, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4714, + "time_per_iteration": 2.5953822135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049292, + "balance_loss_mlp": 1.04006529, + "diversity_loss_mlp": 0.0, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.07092231807138824, + "language_loss": 0.79715693, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80764985, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.09228516, + "routerloss_mlp": 0.0, + "step": 4715, + "time_per_iteration": 2.7853944301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_mlp": 1.03861296, + "diversity_loss_mlp": 0.0, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.08464437568581946, + "language_loss": 0.7548542, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.765329, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4716, + "time_per_iteration": 2.7233853340148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_mlp": 1.03780317, + "diversity_loss_mlp": 0.0, + "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.07842659824105606, + "language_loss": 0.88551593, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89598, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4717, + "time_per_iteration": 2.66381573677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_mlp": 1.03856564, + "diversity_loss_mlp": 0.0, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.06367124229028898, + "language_loss": 0.82281721, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83329189, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 4718, + "time_per_iteration": 2.7830944061279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047901, + "balance_loss_mlp": 1.03901446, + "diversity_loss_mlp": 0.0, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.07734106151470267, + "language_loss": 0.81955713, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.83003616, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4719, + "time_per_iteration": 3.1287927627563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046133, + "balance_loss_mlp": 1.03729379, + "diversity_loss_mlp": 0.0, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.060901042499375765, + "language_loss": 0.86802292, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87848425, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4720, + "time_per_iteration": 2.8568358421325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_mlp": 1.03286505, + "diversity_loss_mlp": 0.0, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.06480953268672687, + "language_loss": 0.79562217, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80603969, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 4721, + "time_per_iteration": 2.7394514083862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104847, + "balance_loss_mlp": 1.03955877, + "diversity_loss_mlp": 0.0, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.09226866260525313, + "language_loss": 0.84760112, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85808581, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4722, + "time_per_iteration": 2.616605281829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_mlp": 1.03716016, + "diversity_loss_mlp": 0.0, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.07637156979590433, + "language_loss": 0.80386579, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81432664, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4723, + "time_per_iteration": 2.740421772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_mlp": 1.03707719, + "diversity_loss_mlp": 0.0, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.06514717136506207, + "language_loss": 0.75284863, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76330721, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4724, + "time_per_iteration": 3.563429117202759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_mlp": 1.0434463, + "diversity_loss_mlp": 0.0, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.06971007170583818, + "language_loss": 0.76744211, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77796578, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4725, + "time_per_iteration": 2.658780336380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052144, + "balance_loss_mlp": 1.0433048, + "diversity_loss_mlp": 0.0, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.06413099042531242, + "language_loss": 0.84407449, + "learning_rate": 2.146770131403658e-05, + "loss": 0.8545959, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4726, + "time_per_iteration": 2.6778671741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_mlp": 1.04029298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.07280363304099743, + "language_loss": 0.81181479, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82230693, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4727, + "time_per_iteration": 2.6568636894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_mlp": 1.04238701, + "diversity_loss_mlp": 0.0, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.0725280737417026, + "language_loss": 0.81922674, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82973742, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4728, + "time_per_iteration": 2.643022060394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_mlp": 1.03610396, + "diversity_loss_mlp": 0.0, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.0673800156354799, + "language_loss": 0.84635472, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85680401, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4729, + "time_per_iteration": 2.724855661392212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_mlp": 1.03952658, + "diversity_loss_mlp": 0.0, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.07330494114530435, + "language_loss": 0.79589331, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80637789, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4730, + "time_per_iteration": 2.665303945541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047478, + "balance_loss_mlp": 1.03856754, + "diversity_loss_mlp": 0.0, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.078385767023693, + "language_loss": 0.80267072, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81314552, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4731, + "time_per_iteration": 3.366713762283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046528, + "balance_loss_mlp": 1.03736663, + "diversity_loss_mlp": 0.0, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.08027492001685438, + "language_loss": 0.81851661, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82898188, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 4732, + "time_per_iteration": 2.511289119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_mlp": 1.04403806, + "diversity_loss_mlp": 0.0, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.07912673976757961, + "language_loss": 0.77801937, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.7885493, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4733, + "time_per_iteration": 2.6270110607147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048563, + "balance_loss_mlp": 1.03949749, + "diversity_loss_mlp": 0.0, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.055649375090756015, + "language_loss": 0.84341621, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85390186, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 4734, + "time_per_iteration": 2.8428561687469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_mlp": 1.0395968, + "diversity_loss_mlp": 0.0, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.07562354165732797, + "language_loss": 0.84999311, + "learning_rate": 2.066245558029256e-05, + "loss": 0.8604784, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4735, + "time_per_iteration": 2.617300033569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047339, + "balance_loss_mlp": 1.03857076, + "diversity_loss_mlp": 0.0, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.06845754764753385, + "language_loss": 0.84216273, + "learning_rate": 2.057391384781182e-05, + "loss": 0.8526361, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4736, + "time_per_iteration": 2.621656894683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053341, + "balance_loss_mlp": 1.04450214, + "diversity_loss_mlp": 0.0, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.07185753448877732, + "language_loss": 0.83150327, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.8420366, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4737, + "time_per_iteration": 2.6248881816864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052271, + "balance_loss_mlp": 1.04334199, + "diversity_loss_mlp": 0.0, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.06362345813560902, + "language_loss": 0.81097478, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.8214975, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4738, + "time_per_iteration": 2.6537606716156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050862, + "balance_loss_mlp": 1.0419693, + "diversity_loss_mlp": 0.0, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.06023003948048014, + "language_loss": 0.81882358, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.82933223, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4739, + "time_per_iteration": 2.7091641426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_mlp": 1.03856826, + "diversity_loss_mlp": 0.0, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.06392422998543029, + "language_loss": 0.82626665, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.8367427, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.0904541, + "routerloss_mlp": 0.0, + "step": 4740, + "time_per_iteration": 2.762544631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049849, + "balance_loss_mlp": 1.04099774, + "diversity_loss_mlp": 0.0, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.0822598036225358, + "language_loss": 0.78046763, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79096615, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4741, + "time_per_iteration": 2.84562087059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_mlp": 1.04134798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.06551662933562434, + "language_loss": 0.857319, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86782068, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4742, + "time_per_iteration": 2.8531861305236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050211, + "balance_loss_mlp": 1.04143143, + "diversity_loss_mlp": 0.0, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.08699441594773756, + "language_loss": 0.87479031, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88529241, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4743, + "time_per_iteration": 2.650739908218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049409, + "balance_loss_mlp": 1.04080176, + "diversity_loss_mlp": 0.0, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.06693150560912724, + "language_loss": 0.826424, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83691812, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4744, + "time_per_iteration": 2.679450035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_mlp": 1.03671229, + "diversity_loss_mlp": 0.0, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.08010451753321661, + "language_loss": 0.79965168, + "learning_rate": 1.978541819374574e-05, + "loss": 0.81010902, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4745, + "time_per_iteration": 2.5925939083099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_mlp": 1.03974199, + "diversity_loss_mlp": 0.0, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06455396152064795, + "language_loss": 0.82245004, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83293486, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4746, + "time_per_iteration": 2.6314661502838135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_mlp": 1.04123759, + "diversity_loss_mlp": 0.0, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.06909556408267023, + "language_loss": 0.83497131, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84546977, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4747, + "time_per_iteration": 2.5474631786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046479, + "balance_loss_mlp": 1.03760934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.07312632583700283, + "language_loss": 0.79836029, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80882508, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4748, + "time_per_iteration": 2.680522918701172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050241, + "balance_loss_mlp": 1.04132986, + "diversity_loss_mlp": 0.0, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.06502832751654097, + "language_loss": 0.83780789, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.84831029, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4749, + "time_per_iteration": 2.7452023029327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_mlp": 1.04147661, + "diversity_loss_mlp": 0.0, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.07375447300189412, + "language_loss": 0.82539463, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.83589756, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4750, + "time_per_iteration": 2.6701533794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_mlp": 1.04132831, + "diversity_loss_mlp": 0.0, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.06117546898764861, + "language_loss": 0.90313232, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91363287, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4751, + "time_per_iteration": 2.8322813510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_mlp": 1.03372943, + "diversity_loss_mlp": 0.0, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.05974577392766342, + "language_loss": 0.84016383, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85059029, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 4752, + "time_per_iteration": 2.688077449798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050001, + "balance_loss_mlp": 1.04098237, + "diversity_loss_mlp": 0.0, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.06413328541809935, + "language_loss": 0.75752521, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76802522, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4753, + "time_per_iteration": 2.650331974029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047101, + "balance_loss_mlp": 1.03808916, + "diversity_loss_mlp": 0.0, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.08121838802327101, + "language_loss": 0.80860132, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81907237, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4754, + "time_per_iteration": 2.6111409664154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051972, + "balance_loss_mlp": 1.04308462, + "diversity_loss_mlp": 0.0, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.06557647778558516, + "language_loss": 0.79137278, + "learning_rate": 1.892702433097776e-05, + "loss": 0.80189246, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4755, + "time_per_iteration": 2.6349050998687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_mlp": 1.0382818, + "diversity_loss_mlp": 0.0, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.06908775382754948, + "language_loss": 0.85741401, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.8678841, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4756, + "time_per_iteration": 2.681579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_mlp": 1.03613043, + "diversity_loss_mlp": 0.0, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.06619379563809555, + "language_loss": 0.81299222, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82344431, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.09069824, + "routerloss_mlp": 0.0, + "step": 4757, + "time_per_iteration": 2.8199880123138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104812, + "balance_loss_mlp": 1.03948975, + "diversity_loss_mlp": 0.0, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.07903863840267403, + "language_loss": 0.82496881, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83544993, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4758, + "time_per_iteration": 2.7341158390045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00789047, + "balance_loss_mlp": 1.333637, + "diversity_loss_mlp": 0.22318089, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.0321241392563351, + "language_loss": 0.83172476, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83961523, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01063856, + "step": 4759, + "time_per_iteration": 2.597241163253784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008181, + "balance_loss_mlp": 1.00372291, + "diversity_loss_mlp": 0.0, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006260194037571693, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75827253, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4760, + "time_per_iteration": 4.852627754211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_mlp": 1.00331163, + "diversity_loss_mlp": 0.0, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006798931475656377, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80583847, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4761, + "time_per_iteration": 4.994880437850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047238, + "balance_loss_mlp": 1.03847599, + "diversity_loss_mlp": 0.0, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.05790619573319675, + "language_loss": 0.80362964, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81410205, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4762, + "time_per_iteration": 2.752257823944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046406, + "balance_loss_mlp": 1.03779316, + "diversity_loss_mlp": 0.0, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.07895774001894396, + "language_loss": 0.8113842, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.82184827, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.08624268, + "routerloss_mlp": 0.0, + "step": 4763, + "time_per_iteration": 2.7287051677703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_mlp": 1.03780174, + "diversity_loss_mlp": 0.0, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.06309721497849985, + "language_loss": 0.8474853, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85795045, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4764, + "time_per_iteration": 3.033647060394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_mlp": 1.03908885, + "diversity_loss_mlp": 0.0, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.06604892374800723, + "language_loss": 0.8237828, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.83426178, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4765, + "time_per_iteration": 2.6534650325775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_mlp": 1.03842866, + "diversity_loss_mlp": 0.0, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.05990107828974712, + "language_loss": 0.84426653, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85473955, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4766, + "time_per_iteration": 2.907374620437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_mlp": 1.04099941, + "diversity_loss_mlp": 0.0, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.06352266446456978, + "language_loss": 0.8504523, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.86095023, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4767, + "time_per_iteration": 2.526810884475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_mlp": 1.04373765, + "diversity_loss_mlp": 0.0, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 0.07650045088890157, + "language_loss": 0.80317563, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81370461, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4768, + "time_per_iteration": 2.8045382499694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006939, + "balance_loss_mlp": 1.00245714, + "diversity_loss_mlp": 0.0, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.004694640504473852, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79187173, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4769, + "time_per_iteration": 5.044682264328003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_mlp": 1.03626275, + "diversity_loss_mlp": 0.0, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06781997849214876, + "language_loss": 0.85250586, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86295819, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 4770, + "time_per_iteration": 2.691663980484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_mlp": 1.04060245, + "diversity_loss_mlp": 0.0, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.06638212987757935, + "language_loss": 0.84090322, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.85139954, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4771, + "time_per_iteration": 2.4912991523742676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048451, + "balance_loss_mlp": 1.03953981, + "diversity_loss_mlp": 0.0, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.06646365406462024, + "language_loss": 0.80387986, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81436437, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4772, + "time_per_iteration": 2.5629544258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051354, + "balance_loss_mlp": 1.04250824, + "diversity_loss_mlp": 0.0, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.06012915212224945, + "language_loss": 0.87101483, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88152838, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4773, + "time_per_iteration": 2.7718729972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_mlp": 1.04024041, + "diversity_loss_mlp": 0.0, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.05995829646518676, + "language_loss": 0.8283515, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83884239, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4774, + "time_per_iteration": 3.344503164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051218, + "balance_loss_mlp": 1.04242659, + "diversity_loss_mlp": 0.0, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.06073994859782487, + "language_loss": 0.84713805, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85765028, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4775, + "time_per_iteration": 2.641633987426758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_mlp": 1.04000342, + "diversity_loss_mlp": 0.0, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.07386829063235183, + "language_loss": 0.79117858, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80166662, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4776, + "time_per_iteration": 3.311635971069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03558719, + "diversity_loss_mlp": 0.0, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.065220381744787, + "language_loss": 0.84085238, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.85129607, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4777, + "time_per_iteration": 2.6673994064331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_mlp": 1.04089904, + "diversity_loss_mlp": 0.0, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.07320352446310975, + "language_loss": 0.79461032, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.8051064, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4778, + "time_per_iteration": 2.7164108753204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_mlp": 1.03928864, + "diversity_loss_mlp": 0.0, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.06805017648291643, + "language_loss": 0.79739892, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.80787992, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4779, + "time_per_iteration": 3.1064183712005615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006109, + "balance_loss_mlp": 1.00162721, + "diversity_loss_mlp": 0.0, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.003729713968603667, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80801499, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4780, + "time_per_iteration": 4.670097351074219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.03773558, + "diversity_loss_mlp": 0.0, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.07167718666233086, + "language_loss": 0.78371525, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79418308, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4781, + "time_per_iteration": 2.550713300704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047938, + "balance_loss_mlp": 1.03888965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 0.06622093872641387, + "language_loss": 0.84516716, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85564649, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4782, + "time_per_iteration": 3.2526657581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049444, + "balance_loss_mlp": 1.04045606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.06845257893605372, + "language_loss": 0.77780342, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78829783, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4783, + "time_per_iteration": 2.6809587478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_mlp": 1.03927171, + "diversity_loss_mlp": 0.0, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.06867364706040735, + "language_loss": 0.85155487, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.86203671, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4784, + "time_per_iteration": 2.7345173358917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787021, + "balance_loss_mlp": 1.32680988, + "diversity_loss_mlp": 0.22533412, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.03407668721721812, + "language_loss": 0.82609832, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83396852, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01094901, + "step": 4785, + "time_per_iteration": 2.685168504714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_mlp": 1.03591502, + "diversity_loss_mlp": 0.0, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.0666841111034061, + "language_loss": 0.77980995, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79025835, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 4786, + "time_per_iteration": 3.038740873336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051934, + "balance_loss_mlp": 1.04323745, + "diversity_loss_mlp": 0.0, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.06529131061254304, + "language_loss": 0.78631401, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79683334, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4787, + "time_per_iteration": 2.65899658203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_mlp": 1.03834498, + "diversity_loss_mlp": 0.0, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.0788130161570292, + "language_loss": 0.8252883, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83576053, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4788, + "time_per_iteration": 2.6822633743286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_mlp": 1.04112673, + "diversity_loss_mlp": 0.0, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07410580856976316, + "language_loss": 0.82474685, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83524632, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4789, + "time_per_iteration": 2.9761576652526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045718, + "balance_loss_mlp": 1.03685474, + "diversity_loss_mlp": 0.0, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.0657414722313636, + "language_loss": 0.76699907, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77745622, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4790, + "time_per_iteration": 2.523589849472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004074, + "balance_loss_mlp": 0.99959189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.003599452207974624, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78074342, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.04492188, + "routerloss_mlp": 0.0, + "step": 4791, + "time_per_iteration": 4.9881064891815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052288, + "balance_loss_mlp": 1.04335308, + "diversity_loss_mlp": 0.0, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.06705071724600334, + "language_loss": 0.76482338, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77534628, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4792, + "time_per_iteration": 2.9655344486236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044613, + "balance_loss_mlp": 1.03580952, + "diversity_loss_mlp": 0.0, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.20959696332428077, + "language_loss": 0.80366439, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81411052, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4793, + "time_per_iteration": 2.554954767227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044371, + "balance_loss_mlp": 1.0355078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.07075391253683742, + "language_loss": 0.84841311, + "learning_rate": 1.575804349061616e-05, + "loss": 0.8588568, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 4794, + "time_per_iteration": 2.5949018001556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_mlp": 1.0387888, + "diversity_loss_mlp": 0.0, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.0784160138888604, + "language_loss": 0.79135698, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80183321, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4795, + "time_per_iteration": 2.598656415939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_mlp": 1.03969288, + "diversity_loss_mlp": 0.0, + "epoch": 0.9226625625240477, + "flos": 874640623104.0, + "grad_norm": 0.06249472558878416, + "language_loss": 0.75247115, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76295394, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 4796, + "time_per_iteration": 3.1448936462402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050009, + "balance_loss_mlp": 1.04117608, + "diversity_loss_mlp": 0.0, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.07031980659654383, + "language_loss": 0.88239074, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89289081, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4797, + "time_per_iteration": 2.543046474456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00783825, + "balance_loss_mlp": 1.32076359, + "diversity_loss_mlp": 0.2258461, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.030753006157988122, + "language_loss": 0.84967744, + "learning_rate": 1.544915681564829e-05, + "loss": 0.85751569, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01051996, + "step": 4798, + "time_per_iteration": 2.819098949432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_mlp": 1.04029381, + "diversity_loss_mlp": 0.0, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.06926441515905145, + "language_loss": 0.79267633, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80316746, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4799, + "time_per_iteration": 3.0866541862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048573, + "balance_loss_mlp": 1.03970361, + "diversity_loss_mlp": 0.0, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.06842232748476472, + "language_loss": 0.84939086, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85987657, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4800, + "time_per_iteration": 2.840742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048436, + "balance_loss_mlp": 1.03941798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.07816499010690336, + "language_loss": 0.76574665, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77623105, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4801, + "time_per_iteration": 2.8335320949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_mlp": 1.04159379, + "diversity_loss_mlp": 0.0, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.06210245880406286, + "language_loss": 0.843297, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85380167, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4802, + "time_per_iteration": 2.6566197872161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_mlp": 1.03858757, + "diversity_loss_mlp": 0.0, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.09058835826894181, + "language_loss": 0.81587046, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82634509, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4803, + "time_per_iteration": 2.6244726181030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045538, + "balance_loss_mlp": 1.0367403, + "diversity_loss_mlp": 0.0, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.06939366274556823, + "language_loss": 0.73765552, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74811089, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4804, + "time_per_iteration": 2.8761777877807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050005, + "balance_loss_mlp": 1.04128492, + "diversity_loss_mlp": 0.0, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.07337139477875909, + "language_loss": 0.79396987, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80446994, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4805, + "time_per_iteration": 2.9650769233703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046276, + "balance_loss_mlp": 1.03742468, + "diversity_loss_mlp": 0.0, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.07187105546875673, + "language_loss": 0.90605378, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91651654, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4806, + "time_per_iteration": 2.6060450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00788148, + "balance_loss_mlp": 1.33026791, + "diversity_loss_mlp": 0.22471815, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.03522090358058462, + "language_loss": 0.77311206, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78099358, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01065494, + "step": 4807, + "time_per_iteration": 2.9656925201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047033, + "balance_loss_mlp": 1.03775859, + "diversity_loss_mlp": 0.0, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.05970940147953983, + "language_loss": 0.85029161, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.860762, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.0927124, + "routerloss_mlp": 0.0, + "step": 4808, + "time_per_iteration": 2.730725049972534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_mlp": 1.0360359, + "diversity_loss_mlp": 0.0, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.07434097229920794, + "language_loss": 0.85175872, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86221194, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.09283447, + "routerloss_mlp": 0.0, + "step": 4809, + "time_per_iteration": 2.677678346633911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047523, + "balance_loss_mlp": 1.03848672, + "diversity_loss_mlp": 0.0, + "epoch": 0.9253559061177375, + "flos": 611280608256.0, + "grad_norm": 0.06773039177733224, + "language_loss": 0.79278344, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80325866, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4810, + "time_per_iteration": 2.7994203567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003108, + "balance_loss_mlp": 0.99864995, + "diversity_loss_mlp": 0.0, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.003310724835280569, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77928501, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4811, + "time_per_iteration": 4.716477394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053397, + "balance_loss_mlp": 1.04443264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.07798685492020957, + "language_loss": 0.80983555, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.82036948, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4812, + "time_per_iteration": 3.111435651779175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.03743255, + "diversity_loss_mlp": 0.0, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.07891038810151499, + "language_loss": 0.83191156, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84237611, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 4813, + "time_per_iteration": 2.5696511268615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_mlp": 1.04083896, + "diversity_loss_mlp": 0.0, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.06938826271777476, + "language_loss": 0.79197675, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80247152, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4814, + "time_per_iteration": 2.716487407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_mlp": 1.03926587, + "diversity_loss_mlp": 0.0, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.06659923130000121, + "language_loss": 0.8535648, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86404449, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4815, + "time_per_iteration": 2.7244887351989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_mlp": 1.0416646, + "diversity_loss_mlp": 0.0, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.06890226138960381, + "language_loss": 0.83825701, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84876096, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4816, + "time_per_iteration": 2.464979887008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048928, + "balance_loss_mlp": 1.04029167, + "diversity_loss_mlp": 0.0, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.07919281923401009, + "language_loss": 0.84257257, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85306185, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4817, + "time_per_iteration": 2.640580415725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.03899682, + "diversity_loss_mlp": 0.0, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06905431252215245, + "language_loss": 0.82030249, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.83077914, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.08679199, + "routerloss_mlp": 0.0, + "step": 4818, + "time_per_iteration": 2.6683123111724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_mlp": 1.04013073, + "diversity_loss_mlp": 0.0, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.06364347314694363, + "language_loss": 0.82941604, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.8399058, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4819, + "time_per_iteration": 2.622507333755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_mlp": 1.03880203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.07369631813155064, + "language_loss": 0.8604511, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87092614, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4820, + "time_per_iteration": 2.5886447429656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_mlp": 1.03391302, + "diversity_loss_mlp": 0.0, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.06986061953541225, + "language_loss": 0.78981894, + "learning_rate": 1.373152729763938e-05, + "loss": 0.80024862, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.09051514, + "routerloss_mlp": 0.0, + "step": 4821, + "time_per_iteration": 3.002431869506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100315, + "balance_loss_mlp": 0.99869162, + "diversity_loss_mlp": 0.0, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.0033138689547235365, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83383614, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4822, + "time_per_iteration": 4.872236728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_mlp": 1.03961504, + "diversity_loss_mlp": 0.0, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.10753003885480804, + "language_loss": 0.80162168, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81210387, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4823, + "time_per_iteration": 3.065385103225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_mlp": 1.03920078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9280492497114274, + "flos": 412223883264.0, + "grad_norm": 0.07544984559040653, + "language_loss": 0.74334532, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75382626, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4824, + "time_per_iteration": 2.459182024002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045084, + "balance_loss_mlp": 1.03613138, + "diversity_loss_mlp": 0.0, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.1022591189326798, + "language_loss": 0.84062541, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85107625, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4825, + "time_per_iteration": 2.7902333736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_mlp": 1.03844011, + "diversity_loss_mlp": 0.0, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06332347540086566, + "language_loss": 0.80870605, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81918162, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.09118652, + "routerloss_mlp": 0.0, + "step": 4826, + "time_per_iteration": 3.014462947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078881, + "balance_loss_mlp": 1.33157349, + "diversity_loss_mlp": 0.22439189, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.028742947039502215, + "language_loss": 0.83905512, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84694326, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01082761, + "step": 4827, + "time_per_iteration": 3.0634989738464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_mlp": 1.03804338, + "diversity_loss_mlp": 0.0, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.07468304915568001, + "language_loss": 0.80064201, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81110942, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4828, + "time_per_iteration": 2.9195716381073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104862, + "balance_loss_mlp": 1.03953636, + "diversity_loss_mlp": 0.0, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.06920378526179259, + "language_loss": 0.83656001, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84704626, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.09082031, + "routerloss_mlp": 0.0, + "step": 4829, + "time_per_iteration": 2.5818231105804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100325, + "balance_loss_mlp": 0.99879169, + "diversity_loss_mlp": 0.0, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.0032198614954978416, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73125315, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4830, + "time_per_iteration": 4.951828718185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003246, + "balance_loss_mlp": 0.99878782, + "diversity_loss_mlp": 0.0, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.003220380799395436, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80515087, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.04467773, + "routerloss_mlp": 0.0, + "step": 4831, + "time_per_iteration": 4.905702590942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_mlp": 1.04304385, + "diversity_loss_mlp": 0.0, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.08579455116544467, + "language_loss": 0.84383392, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85435468, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4832, + "time_per_iteration": 2.6667189598083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051534, + "balance_loss_mlp": 1.04263496, + "diversity_loss_mlp": 0.0, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.07653793753230506, + "language_loss": 0.80192435, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.81243968, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4833, + "time_per_iteration": 2.530576705932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784425, + "balance_loss_mlp": 1.32099247, + "diversity_loss_mlp": 0.22666103, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.02823635345590092, + "language_loss": 0.80189478, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.80973905, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01059832, + "step": 4834, + "time_per_iteration": 2.8291900157928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_mlp": 1.03810263, + "diversity_loss_mlp": 0.0, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.06755490191164544, + "language_loss": 0.82792151, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83838797, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.08551025, + "routerloss_mlp": 0.0, + "step": 4835, + "time_per_iteration": 2.823146104812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_mlp": 0.99874461, + "diversity_loss_mlp": 0.0, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.0032138775564420387, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77855623, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4836, + "time_per_iteration": 4.9668896198272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_mlp": 1.04090357, + "diversity_loss_mlp": 0.0, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.07503406646188047, + "language_loss": 0.82993883, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84043521, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4837, + "time_per_iteration": 2.637373924255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045053, + "balance_loss_mlp": 1.03608274, + "diversity_loss_mlp": 0.0, + "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.08374095917705242, + "language_loss": 0.81554383, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82599437, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4838, + "time_per_iteration": 2.515183448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784124, + "balance_loss_mlp": 1.32047153, + "diversity_loss_mlp": 0.22594652, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.03184031575728359, + "language_loss": 0.86872089, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87656212, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01091547, + "step": 4839, + "time_per_iteration": 2.7616403102874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_mlp": 1.03896284, + "diversity_loss_mlp": 0.0, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.061490618450412385, + "language_loss": 0.76999998, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.78047729, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4840, + "time_per_iteration": 3.0911495685577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_mlp": 1.04037976, + "diversity_loss_mlp": 0.0, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.07195558921256455, + "language_loss": 0.82423127, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83472311, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4841, + "time_per_iteration": 2.6239800453186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_mlp": 1.0373354, + "diversity_loss_mlp": 0.0, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07717139537202818, + "language_loss": 0.81388533, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82434833, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 4842, + "time_per_iteration": 2.5533297061920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104779, + "balance_loss_mlp": 1.03906965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.07073553761883396, + "language_loss": 0.77673644, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78721428, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4843, + "time_per_iteration": 2.528705358505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046518, + "balance_loss_mlp": 1.03777993, + "diversity_loss_mlp": 0.0, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.06887316839423005, + "language_loss": 0.7711761, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78164124, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 4844, + "time_per_iteration": 2.7841336727142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_mlp": 1.0387938, + "diversity_loss_mlp": 0.0, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.07471339735643584, + "language_loss": 0.80957037, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.82004702, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 4845, + "time_per_iteration": 2.5967259407043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047686, + "balance_loss_mlp": 1.03901899, + "diversity_loss_mlp": 0.0, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.0577436249864269, + "language_loss": 0.80821908, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.8186959, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 4846, + "time_per_iteration": 2.735156774520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780467, + "balance_loss_mlp": 1.31358051, + "diversity_loss_mlp": 0.22594975, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.03394753668394222, + "language_loss": 0.82436854, + "learning_rate": 1.191013150742537e-05, + "loss": 0.83217323, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01070218, + "step": 4847, + "time_per_iteration": 2.730957269668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047642, + "balance_loss_mlp": 1.03871894, + "diversity_loss_mlp": 0.0, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.06722310118133415, + "language_loss": 0.82897216, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83944857, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4848, + "time_per_iteration": 3.0189881324768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_mlp": 1.03535616, + "diversity_loss_mlp": 0.0, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.08276324861402574, + "language_loss": 0.78624225, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79668438, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4849, + "time_per_iteration": 3.2938950061798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_mlp": 1.03470397, + "diversity_loss_mlp": 0.0, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.07019081687121781, + "language_loss": 0.80391824, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81435609, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4850, + "time_per_iteration": 2.7515499591827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047473, + "balance_loss_mlp": 1.03849709, + "diversity_loss_mlp": 0.0, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.06820253841014733, + "language_loss": 0.85668182, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86715662, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4851, + "time_per_iteration": 2.680340528488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047775, + "balance_loss_mlp": 1.03895366, + "diversity_loss_mlp": 0.0, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.1196628498062152, + "language_loss": 0.8199991, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.83047688, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4852, + "time_per_iteration": 2.5966830253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045996, + "balance_loss_mlp": 1.03706086, + "diversity_loss_mlp": 0.0, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.07419739239105261, + "language_loss": 0.82826304, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.838723, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4853, + "time_per_iteration": 2.7714791297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100356, + "balance_loss_mlp": 0.999125, + "diversity_loss_mlp": 0.0, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.004307105036144614, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.7945857, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4854, + "time_per_iteration": 4.893805265426636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046802, + "balance_loss_mlp": 1.03777242, + "diversity_loss_mlp": 0.0, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.06988266936343929, + "language_loss": 0.81466687, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82513487, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 4855, + "time_per_iteration": 2.9019949436187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046813, + "balance_loss_mlp": 1.03802776, + "diversity_loss_mlp": 0.0, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.06582390304127933, + "language_loss": 0.76894152, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77940965, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4856, + "time_per_iteration": 2.650545597076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049411, + "balance_loss_mlp": 1.04064322, + "diversity_loss_mlp": 0.0, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.0537499767930613, + "language_loss": 0.84482789, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.855322, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4857, + "time_per_iteration": 2.82725191116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044882, + "balance_loss_mlp": 1.03607237, + "diversity_loss_mlp": 0.0, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.0729144221953202, + "language_loss": 0.80315518, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.813604, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4858, + "time_per_iteration": 2.575934410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00589881, + "balance_loss_mlp": 1.02917051, + "diversity_loss_mlp": 0.13201948, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.001270733186004784, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76577604, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928602, + "step": 4859, + "time_per_iteration": 4.699007987976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043954, + "balance_loss_mlp": 1.03486431, + "diversity_loss_mlp": 0.0, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.05691745976231031, + "language_loss": 0.81198478, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.82242435, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4860, + "time_per_iteration": 2.777012825012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043681, + "balance_loss_mlp": 1.0347048, + "diversity_loss_mlp": 0.0, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.10010618557822787, + "language_loss": 0.79151934, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.80195618, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.08984375, + "routerloss_mlp": 0.0, + "step": 4861, + "time_per_iteration": 2.6320016384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044691, + "balance_loss_mlp": 1.03600073, + "diversity_loss_mlp": 0.0, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.08362946312424821, + "language_loss": 0.86286509, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87331206, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4862, + "time_per_iteration": 2.6105833053588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_mlp": 1.03586817, + "diversity_loss_mlp": 0.0, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.05900589694062164, + "language_loss": 0.84758484, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85803056, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4863, + "time_per_iteration": 2.7426371574401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045058, + "balance_loss_mlp": 1.03628969, + "diversity_loss_mlp": 0.0, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.0932071553441471, + "language_loss": 0.78725177, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79770231, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4864, + "time_per_iteration": 2.5507235527038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_mlp": 1.0366559, + "diversity_loss_mlp": 0.0, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.08522352532070332, + "language_loss": 0.76506388, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77551782, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4865, + "time_per_iteration": 4.053428649902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045737, + "balance_loss_mlp": 1.03688622, + "diversity_loss_mlp": 0.0, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.06592512300053538, + "language_loss": 0.85022455, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86068201, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4866, + "time_per_iteration": 2.608532667160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_mlp": 1.03633118, + "diversity_loss_mlp": 0.0, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.08990017203823457, + "language_loss": 0.84334439, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85379487, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4867, + "time_per_iteration": 2.7455151081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003805, + "balance_loss_mlp": 0.99937075, + "diversity_loss_mlp": 0.0, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.005040674101907188, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80207145, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4868, + "time_per_iteration": 4.876135587692261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_mlp": 1.03491819, + "diversity_loss_mlp": 0.0, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.07053266752313711, + "language_loss": 0.81646079, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82689929, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 4869, + "time_per_iteration": 2.708939790725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046683, + "balance_loss_mlp": 1.03781986, + "diversity_loss_mlp": 0.0, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.060976282943095796, + "language_loss": 0.82172537, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83219218, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4870, + "time_per_iteration": 2.65925669670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010449, + "balance_loss_mlp": 1.03607297, + "diversity_loss_mlp": 0.0, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.062164083958686674, + "language_loss": 0.78947985, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79992884, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4871, + "time_per_iteration": 2.9242799282073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040518, + "balance_loss_mlp": 1.03191113, + "diversity_loss_mlp": 0.0, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.06036408822352837, + "language_loss": 0.78672194, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79712713, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4872, + "time_per_iteration": 2.7019548416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044427, + "balance_loss_mlp": 1.03556955, + "diversity_loss_mlp": 0.0, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.0656254889693481, + "language_loss": 0.81680477, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82724905, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 4873, + "time_per_iteration": 2.6661787033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_mlp": 1.04079902, + "diversity_loss_mlp": 0.0, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.07062356836033176, + "language_loss": 0.82414687, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83464432, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4874, + "time_per_iteration": 2.711991310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047638, + "balance_loss_mlp": 1.03853059, + "diversity_loss_mlp": 0.0, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 0.07310284560827243, + "language_loss": 0.80373824, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81421459, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 4875, + "time_per_iteration": 2.650681972503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_mlp": 1.03720248, + "diversity_loss_mlp": 0.0, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.062293314386374414, + "language_loss": 0.77575111, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.78621429, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.09112549, + "routerloss_mlp": 0.0, + "step": 4876, + "time_per_iteration": 2.6609630584716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_mlp": 1.03741789, + "diversity_loss_mlp": 0.0, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06315414550541629, + "language_loss": 0.84719789, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85766232, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4877, + "time_per_iteration": 2.651707172393799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_mlp": 1.03690422, + "diversity_loss_mlp": 0.0, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.0631685338403412, + "language_loss": 0.81854308, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82899821, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4878, + "time_per_iteration": 2.949418783187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_mlp": 1.04068828, + "diversity_loss_mlp": 0.0, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.08565110846317762, + "language_loss": 0.80980134, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82029772, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4879, + "time_per_iteration": 2.6912460327148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048881, + "balance_loss_mlp": 1.03982735, + "diversity_loss_mlp": 0.0, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.0804374374941349, + "language_loss": 0.79621142, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80670023, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4880, + "time_per_iteration": 2.6459033489227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045381, + "balance_loss_mlp": 1.03668451, + "diversity_loss_mlp": 0.0, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.07482420746454603, + "language_loss": 0.80257022, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81302404, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 4881, + "time_per_iteration": 2.5740063190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_mlp": 1.03760171, + "diversity_loss_mlp": 0.0, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.0852712224295467, + "language_loss": 0.76510745, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77557057, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4882, + "time_per_iteration": 3.689307689666748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050073, + "balance_loss_mlp": 1.04114449, + "diversity_loss_mlp": 0.0, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.09232552056587429, + "language_loss": 0.7808578, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79135859, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4883, + "time_per_iteration": 2.7796614170074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004059, + "balance_loss_mlp": 0.99962485, + "diversity_loss_mlp": 0.0, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.004454986396057403, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79174733, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4884, + "time_per_iteration": 4.815665245056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049205, + "balance_loss_mlp": 1.04029393, + "diversity_loss_mlp": 0.0, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.06863865940742271, + "language_loss": 0.78660077, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79709285, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4885, + "time_per_iteration": 2.6168384552001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104537, + "balance_loss_mlp": 1.03678083, + "diversity_loss_mlp": 0.0, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.07504646640886149, + "language_loss": 0.83853602, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84898973, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 4886, + "time_per_iteration": 2.5939505100250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00589544, + "balance_loss_mlp": 1.02856016, + "diversity_loss_mlp": 0.13189431, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.0012775322428382279, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.79921734, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00931658, + "step": 4887, + "time_per_iteration": 4.869856357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047496, + "balance_loss_mlp": 1.03856742, + "diversity_loss_mlp": 0.0, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.06148309655419226, + "language_loss": 0.85074973, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86122465, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 4888, + "time_per_iteration": 2.723494052886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004242, + "balance_loss_mlp": 0.99983096, + "diversity_loss_mlp": 0.0, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + "grad_norm": 0.0026510918871896585, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76173675, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4889, + "time_per_iteration": 4.887513637542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_mlp": 1.03848016, + "diversity_loss_mlp": 0.0, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.07795508435687046, + "language_loss": 0.83106863, + "learning_rate": 9.179144190235799e-06, + "loss": 0.8415432, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4890, + "time_per_iteration": 2.6607882976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_mlp": 1.03781509, + "diversity_loss_mlp": 0.0, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06087500740988416, + "language_loss": 0.76773834, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77820671, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 4891, + "time_per_iteration": 2.704505205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004256, + "balance_loss_mlp": 0.99982125, + "diversity_loss_mlp": 0.0, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.0026524442975608157, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81246138, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4892, + "time_per_iteration": 4.861233949661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_mlp": 1.04099298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.0781928260181619, + "language_loss": 0.78609961, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79659593, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 4893, + "time_per_iteration": 2.7279999256134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048837, + "balance_loss_mlp": 1.03969967, + "diversity_loss_mlp": 0.0, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.06974865955281616, + "language_loss": 0.80413878, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81462717, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.0914917, + "routerloss_mlp": 0.0, + "step": 4894, + "time_per_iteration": 3.0058786869049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050294, + "balance_loss_mlp": 1.04135358, + "diversity_loss_mlp": 0.0, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.08932063460271895, + "language_loss": 0.79991817, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81042111, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4895, + "time_per_iteration": 3.1561882495880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_mlp": 1.03778601, + "diversity_loss_mlp": 0.0, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.0641512346414091, + "language_loss": 0.85852486, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86898911, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4896, + "time_per_iteration": 2.6913957595825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045265, + "balance_loss_mlp": 1.03639615, + "diversity_loss_mlp": 0.0, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.0665448744143015, + "language_loss": 0.80267036, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81312299, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 4897, + "time_per_iteration": 2.7335498332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_mlp": 1.0395788, + "diversity_loss_mlp": 0.0, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.07266036540005272, + "language_loss": 0.86784083, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87832236, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 4898, + "time_per_iteration": 2.820343255996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_mlp": 1.04090655, + "diversity_loss_mlp": 0.0, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.07366201746067845, + "language_loss": 0.84326768, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85376465, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 4899, + "time_per_iteration": 2.708554744720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_mlp": 1.03626442, + "diversity_loss_mlp": 0.0, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.07321817964783915, + "language_loss": 0.79721165, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80766267, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4900, + "time_per_iteration": 2.674393892288208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054375, + "balance_loss_mlp": 1.04557145, + "diversity_loss_mlp": 0.0, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.0749978632070715, + "language_loss": 0.78455758, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79510128, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4901, + "time_per_iteration": 2.805161952972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047513, + "balance_loss_mlp": 1.03873909, + "diversity_loss_mlp": 0.0, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.07047076389805079, + "language_loss": 0.82071722, + "learning_rate": 8.479809201123178e-06, + "loss": 0.83119237, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4902, + "time_per_iteration": 2.732999324798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_mlp": 1.03907359, + "diversity_loss_mlp": 0.0, + "epoch": 0.943247402847249, + "flos": 565990571520.0, + "grad_norm": 0.06786486493908951, + "language_loss": 0.78043211, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79091066, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4903, + "time_per_iteration": 2.7100279331207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048262, + "balance_loss_mlp": 1.03943491, + "diversity_loss_mlp": 0.0, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.07474785644916408, + "language_loss": 0.81409293, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82457554, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 4904, + "time_per_iteration": 2.598313093185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046884, + "balance_loss_mlp": 1.0381397, + "diversity_loss_mlp": 0.0, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.06861839019347821, + "language_loss": 0.82857311, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83904195, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4905, + "time_per_iteration": 2.7130138874053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010495, + "balance_loss_mlp": 1.04049969, + "diversity_loss_mlp": 0.0, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.05740754157545699, + "language_loss": 0.85487771, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86537278, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 4906, + "time_per_iteration": 2.819689989089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_mlp": 1.03723109, + "diversity_loss_mlp": 0.0, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.0749683755111213, + "language_loss": 0.81567025, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82613063, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4907, + "time_per_iteration": 2.554344415664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_mlp": 1.04098761, + "diversity_loss_mlp": 0.0, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.06901073906266146, + "language_loss": 0.73883832, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74933642, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4908, + "time_per_iteration": 3.0110507011413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_mlp": 1.03897214, + "diversity_loss_mlp": 0.0, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.07411833720689497, + "language_loss": 0.8239246, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83440387, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4909, + "time_per_iteration": 2.6770436763763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047051, + "balance_loss_mlp": 1.03801453, + "diversity_loss_mlp": 0.0, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06788128134122538, + "language_loss": 0.86283165, + "learning_rate": 8.028849459169318e-06, + "loss": 0.8733021, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.09039307, + "routerloss_mlp": 0.0, + "step": 4910, + "time_per_iteration": 2.582549810409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_mlp": 1.04030466, + "diversity_loss_mlp": 0.0, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.0678450295570026, + "language_loss": 0.80976182, + "learning_rate": 7.97333876382028e-06, + "loss": 0.82025248, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4911, + "time_per_iteration": 2.8425984382629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049899, + "balance_loss_mlp": 1.04112482, + "diversity_loss_mlp": 0.0, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.08525541673585063, + "language_loss": 0.81182563, + "learning_rate": 7.918019090162098e-06, + "loss": 0.82232463, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.08789062, + "routerloss_mlp": 0.0, + "step": 4912, + "time_per_iteration": 2.7192227840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004436, + "balance_loss_mlp": 1.00002539, + "diversity_loss_mlp": 0.0, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.00558203174928547, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79291773, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4913, + "time_per_iteration": 4.945667505264282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_mlp": 1.0412302, + "diversity_loss_mlp": 0.0, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.07323836789774518, + "language_loss": 0.90345061, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91395086, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4914, + "time_per_iteration": 2.628188371658325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004442, + "balance_loss_mlp": 1.00000703, + "diversity_loss_mlp": 0.0, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.00558152160329536, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.8456679, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.04443359, + "routerloss_mlp": 0.0, + "step": 4915, + "time_per_iteration": 4.940939426422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_mlp": 1.04091215, + "diversity_loss_mlp": 0.0, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.05816068289189103, + "language_loss": 0.81779099, + "learning_rate": 7.698651040865534e-06, + "loss": 0.8282882, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4916, + "time_per_iteration": 2.622225522994995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_mlp": 1.03712368, + "diversity_loss_mlp": 0.0, + "epoch": 0.9459407464409388, + "flos": 1019405979648.0, + "grad_norm": 0.06122686842867312, + "language_loss": 0.82315564, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83361328, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.08654785, + "routerloss_mlp": 0.0, + "step": 4917, + "time_per_iteration": 3.3565821647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050444, + "balance_loss_mlp": 1.04189634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.07064430272408662, + "language_loss": 0.81672692, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82723141, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.08557129, + "routerloss_mlp": 0.0, + "step": 4918, + "time_per_iteration": 2.609248399734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049398, + "balance_loss_mlp": 1.04064822, + "diversity_loss_mlp": 0.0, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.07970710282703287, + "language_loss": 0.7821058, + "learning_rate": 7.536131776620936e-06, + "loss": 0.7925998, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4919, + "time_per_iteration": 2.6066248416900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_mlp": 1.0406847, + "diversity_loss_mlp": 0.0, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.08687319482199532, + "language_loss": 0.83590424, + "learning_rate": 7.482341043430485e-06, + "loss": 0.8463999, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4920, + "time_per_iteration": 2.579651117324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_mlp": 1.03711653, + "diversity_loss_mlp": 0.0, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.06849366756552606, + "language_loss": 0.85644251, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86690247, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4921, + "time_per_iteration": 2.9116263389587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045153, + "balance_loss_mlp": 1.03621817, + "diversity_loss_mlp": 0.0, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06484399276768851, + "language_loss": 0.89472318, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90517473, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4922, + "time_per_iteration": 2.9387049674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_mlp": 1.03844738, + "diversity_loss_mlp": 0.0, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.08622456288461161, + "language_loss": 0.80096912, + "learning_rate": 7.32211620090012e-06, + "loss": 0.81144309, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4923, + "time_per_iteration": 2.6302578449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050313, + "balance_loss_mlp": 1.04158056, + "diversity_loss_mlp": 0.0, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.0601694962527871, + "language_loss": 0.81003237, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82053542, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4924, + "time_per_iteration": 2.808788299560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051085, + "balance_loss_mlp": 1.04240632, + "diversity_loss_mlp": 0.0, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.06384621728093878, + "language_loss": 0.80346602, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81397688, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 4925, + "time_per_iteration": 2.6172335147857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_mlp": 1.04039288, + "diversity_loss_mlp": 0.0, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.06326857300487894, + "language_loss": 0.85833907, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86883175, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 4926, + "time_per_iteration": 3.1013805866241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046961, + "balance_loss_mlp": 1.03822935, + "diversity_loss_mlp": 0.0, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.0714765450100148, + "language_loss": 0.7945109, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80498052, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 4927, + "time_per_iteration": 2.7759459018707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044976, + "balance_loss_mlp": 1.03620195, + "diversity_loss_mlp": 0.0, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.08515861260909238, + "language_loss": 0.75973248, + "learning_rate": 7.058900559793469e-06, + "loss": 0.77018219, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4928, + "time_per_iteration": 2.8861470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052944, + "balance_loss_mlp": 1.04416978, + "diversity_loss_mlp": 0.0, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.06735199813953592, + "language_loss": 0.83267879, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84320819, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.08776855, + "routerloss_mlp": 0.0, + "step": 4929, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_mlp": 1.03666258, + "diversity_loss_mlp": 0.0, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.06926665939050473, + "language_loss": 0.78147107, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79192489, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 4930, + "time_per_iteration": 2.7705516815185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784775, + "balance_loss_mlp": 1.32251549, + "diversity_loss_mlp": 0.22577199, + "epoch": 0.9486340900346287, + "flos": 538598937600.0, + "grad_norm": 0.030705907107943475, + "language_loss": 0.80018926, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80803692, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01063121, + "step": 4931, + "time_per_iteration": 2.700617551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052654, + "balance_loss_mlp": 1.04359388, + "diversity_loss_mlp": 0.0, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.07163166168335688, + "language_loss": 0.85786635, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86839288, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 4932, + "time_per_iteration": 2.8230526447296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_mlp": 1.03682232, + "diversity_loss_mlp": 0.0, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.07113425512473334, + "language_loss": 0.88082981, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.89128458, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.08660889, + "routerloss_mlp": 0.0, + "step": 4933, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052399, + "balance_loss_mlp": 1.04369068, + "diversity_loss_mlp": 0.0, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.06957529053478449, + "language_loss": 0.82772219, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83824623, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 4934, + "time_per_iteration": 2.682114362716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_mlp": 1.03765225, + "diversity_loss_mlp": 0.0, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.06306988880080433, + "language_loss": 0.8386761, + "learning_rate": 6.698437041126992e-06, + "loss": 0.84913647, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.0838623, + "routerloss_mlp": 0.0, + "step": 4935, + "time_per_iteration": 2.726893424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046934, + "balance_loss_mlp": 1.03838086, + "diversity_loss_mlp": 0.0, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.05973475098726946, + "language_loss": 0.82893109, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83940041, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.08563232, + "routerloss_mlp": 0.0, + "step": 4936, + "time_per_iteration": 2.729111671447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_mlp": 1.03814435, + "diversity_loss_mlp": 0.0, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.07659756248200288, + "language_loss": 0.82697654, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83744407, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 4937, + "time_per_iteration": 2.8081254959106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784639, + "balance_loss_mlp": 1.32296765, + "diversity_loss_mlp": 0.22491853, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.031155014429691368, + "language_loss": 0.86999297, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87783933, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01069584, + "step": 4938, + "time_per_iteration": 2.647392749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049194, + "balance_loss_mlp": 1.04043269, + "diversity_loss_mlp": 0.0, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.06549207812906088, + "language_loss": 0.82709306, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83758503, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4939, + "time_per_iteration": 2.6947948932647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_mlp": 1.04041934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.0674263053300071, + "language_loss": 0.80045903, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81095159, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 4940, + "time_per_iteration": 2.537261486053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_mlp": 1.03906798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.06671960471522939, + "language_loss": 0.84743893, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85791707, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.08764648, + "routerloss_mlp": 0.0, + "step": 4941, + "time_per_iteration": 2.7824418544769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051988, + "balance_loss_mlp": 1.04320264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.07518292778028754, + "language_loss": 0.81734824, + "learning_rate": 6.347357823816235e-06, + "loss": 0.8278681, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 4942, + "time_per_iteration": 2.5175111293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_mlp": 1.03662586, + "diversity_loss_mlp": 0.0, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.06073583327995898, + "language_loss": 0.79565704, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80611289, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 4943, + "time_per_iteration": 2.98564076423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_mlp": 1.03589809, + "diversity_loss_mlp": 0.0, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.07464458367850044, + "language_loss": 0.82931554, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83976078, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 4944, + "time_per_iteration": 2.586824417114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048064, + "balance_loss_mlp": 1.03944492, + "diversity_loss_mlp": 0.0, + "epoch": 0.9513274336283186, + "flos": 614621094912.0, + "grad_norm": 0.0706686343064775, + "language_loss": 0.81845355, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82893419, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 4945, + "time_per_iteration": 2.921309232711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_mlp": 1.03738809, + "diversity_loss_mlp": 0.0, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.07524726970917751, + "language_loss": 0.82137179, + "learning_rate": 6.150957065611363e-06, + "loss": 0.83183479, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4946, + "time_per_iteration": 2.5640242099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049384, + "balance_loss_mlp": 1.04034781, + "diversity_loss_mlp": 0.0, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.07065066286266242, + "language_loss": 0.76635486, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77684867, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 4947, + "time_per_iteration": 2.965193033218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049032, + "balance_loss_mlp": 1.04028833, + "diversity_loss_mlp": 0.0, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.06944081610529035, + "language_loss": 0.75779366, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76828402, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 4948, + "time_per_iteration": 2.8114254474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_mlp": 1.03859949, + "diversity_loss_mlp": 0.0, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.06267886834412634, + "language_loss": 0.80306596, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81354034, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.08843994, + "routerloss_mlp": 0.0, + "step": 4949, + "time_per_iteration": 2.829516887664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_mlp": 1.03901839, + "diversity_loss_mlp": 0.0, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.06460536676220141, + "language_loss": 0.83404064, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84451616, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.08538818, + "routerloss_mlp": 0.0, + "step": 4950, + "time_per_iteration": 3.064345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_mlp": 1.03875649, + "diversity_loss_mlp": 0.0, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.07065514061093704, + "language_loss": 0.80931592, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81979299, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 4951, + "time_per_iteration": 2.9210174083709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_mlp": 1.03724885, + "diversity_loss_mlp": 0.0, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.0779800356462361, + "language_loss": 0.82006431, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83052403, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4952, + "time_per_iteration": 2.570007801055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_mlp": 1.0397898, + "diversity_loss_mlp": 0.0, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.07317068745782636, + "language_loss": 0.81126779, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82175738, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.0916748, + "routerloss_mlp": 0.0, + "step": 4953, + "time_per_iteration": 2.593344211578369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_mlp": 1.03926563, + "diversity_loss_mlp": 0.0, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.06495580169291973, + "language_loss": 0.85402286, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86450183, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 4954, + "time_per_iteration": 2.757946491241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00780626, + "balance_loss_mlp": 1.31376719, + "diversity_loss_mlp": 0.22618601, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.03586731087797504, + "language_loss": 0.81228065, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82008696, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0106497, + "step": 4955, + "time_per_iteration": 2.883862018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_mlp": 1.04027104, + "diversity_loss_mlp": 0.0, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.07193968737801358, + "language_loss": 0.84132719, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85182136, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.09143066, + "routerloss_mlp": 0.0, + "step": 4956, + "time_per_iteration": 2.5883569717407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_mlp": 1.03826427, + "diversity_loss_mlp": 0.0, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.06822952225428794, + "language_loss": 0.81915605, + "learning_rate": 5.626676233493167e-06, + "loss": 0.82962549, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 4957, + "time_per_iteration": 2.630600690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_mlp": 1.040012, + "diversity_loss_mlp": 0.0, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.05995693166435021, + "language_loss": 0.83973289, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85022032, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 4958, + "time_per_iteration": 3.0566930770874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045737, + "balance_loss_mlp": 1.0366534, + "diversity_loss_mlp": 0.0, + "epoch": 0.9540207772220085, + "flos": 556668039168.0, + "grad_norm": 0.06699001332746012, + "language_loss": 0.80331284, + "learning_rate": 5.533846857624203e-06, + "loss": 0.81377017, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 4959, + "time_per_iteration": 2.761378049850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_mlp": 1.03821445, + "diversity_loss_mlp": 0.0, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.0761611393687458, + "language_loss": 0.82048774, + "learning_rate": 5.487720113876882e-06, + "loss": 0.83095926, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4960, + "time_per_iteration": 2.932245969772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_mlp": 1.04009259, + "diversity_loss_mlp": 0.0, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.06840338993330367, + "language_loss": 0.8257823, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83627176, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4961, + "time_per_iteration": 2.7189135551452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049536, + "balance_loss_mlp": 1.04058886, + "diversity_loss_mlp": 0.0, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06804248679935226, + "language_loss": 0.80613565, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81663102, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.08947754, + "routerloss_mlp": 0.0, + "step": 4962, + "time_per_iteration": 3.102736711502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078831, + "balance_loss_mlp": 1.33004642, + "diversity_loss_mlp": 0.2248106, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.03404897095721445, + "language_loss": 0.77822566, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78610873, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01088165, + "step": 4963, + "time_per_iteration": 3.1009397506713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051718, + "balance_loss_mlp": 1.04287314, + "diversity_loss_mlp": 0.0, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.0785854138679803, + "language_loss": 0.82759595, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83811319, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4964, + "time_per_iteration": 2.5947694778442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_mlp": 1.04327834, + "diversity_loss_mlp": 0.0, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.06792534083569658, + "language_loss": 0.82819939, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83872032, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4965, + "time_per_iteration": 2.803609609603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050399, + "balance_loss_mlp": 1.04159546, + "diversity_loss_mlp": 0.0, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.06616240585597659, + "language_loss": 0.8283782, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83888221, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4966, + "time_per_iteration": 2.584310531616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048058, + "balance_loss_mlp": 1.03910518, + "diversity_loss_mlp": 0.0, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.07793598675668457, + "language_loss": 0.8188796, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82936013, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 4967, + "time_per_iteration": 2.592332601547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104625, + "balance_loss_mlp": 1.03739893, + "diversity_loss_mlp": 0.0, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.06516874865343447, + "language_loss": 0.84235787, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85282034, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 4968, + "time_per_iteration": 2.6265814304351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044806, + "balance_loss_mlp": 1.03580022, + "diversity_loss_mlp": 0.0, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05920920196225761, + "language_loss": 0.81924808, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82969612, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 4969, + "time_per_iteration": 2.668456554412842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_mlp": 1.03809857, + "diversity_loss_mlp": 0.0, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.07042790663947672, + "language_loss": 0.79412282, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80459142, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.08770752, + "routerloss_mlp": 0.0, + "step": 4970, + "time_per_iteration": 2.7282607555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050652, + "balance_loss_mlp": 1.0418489, + "diversity_loss_mlp": 0.0, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.06399713345893698, + "language_loss": 0.80029887, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81080544, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4971, + "time_per_iteration": 2.6104438304901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588737, + "balance_loss_mlp": 1.02730632, + "diversity_loss_mlp": 0.13157621, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.0012650050689020306, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.823623, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0092962, + "step": 4972, + "time_per_iteration": 4.941720008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044467, + "balance_loss_mlp": 1.03565741, + "diversity_loss_mlp": 0.0, + "epoch": 0.9567141208156984, + "flos": 503846853120.0, + "grad_norm": 0.059256065258913096, + "language_loss": 0.78335071, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79379541, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4973, + "time_per_iteration": 2.788773775100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_mlp": 1.04036331, + "diversity_loss_mlp": 0.0, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.08268664024117288, + "language_loss": 0.79965454, + "learning_rate": 4.86211231669359e-06, + "loss": 0.81014502, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 4974, + "time_per_iteration": 2.4901206493377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047752, + "balance_loss_mlp": 1.03915691, + "diversity_loss_mlp": 0.0, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.0658884479140285, + "language_loss": 0.78595436, + "learning_rate": 4.818867211936806e-06, + "loss": 0.7964319, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 4975, + "time_per_iteration": 4.219155550003052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043882, + "balance_loss_mlp": 1.03510857, + "diversity_loss_mlp": 0.0, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.07813154083214305, + "language_loss": 0.78541613, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79585493, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 4976, + "time_per_iteration": 2.9422388076782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_mlp": 1.03703845, + "diversity_loss_mlp": 0.0, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.07237747383924455, + "language_loss": 0.84659564, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85705405, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 4977, + "time_per_iteration": 2.826688528060913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004691, + "balance_loss_mlp": 1.0002805, + "diversity_loss_mlp": 0.0, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.006664188824760945, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79611945, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 4978, + "time_per_iteration": 4.937689781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078841, + "balance_loss_mlp": 1.33186579, + "diversity_loss_mlp": 0.22349364, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.030270093123026424, + "language_loss": 0.87261242, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.8804965, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073015, + "step": 4979, + "time_per_iteration": 2.6448476314544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00787724, + "balance_loss_mlp": 1.330446, + "diversity_loss_mlp": 0.2238563, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.03851656500602482, + "language_loss": 0.85486841, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86274564, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105729, + "step": 4980, + "time_per_iteration": 2.513583183288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_mlp": 1.03938699, + "diversity_loss_mlp": 0.0, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.0738676496011813, + "language_loss": 0.80298102, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81346583, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.09106445, + "routerloss_mlp": 0.0, + "step": 4981, + "time_per_iteration": 3.532383441925049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_mlp": 1.03933644, + "diversity_loss_mlp": 0.0, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.05859325637714088, + "language_loss": 0.79110616, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80158764, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4982, + "time_per_iteration": 2.6554603576660156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_mlp": 1.03964579, + "diversity_loss_mlp": 0.0, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.05822993259734132, + "language_loss": 0.81000149, + "learning_rate": 4.479828637655392e-06, + "loss": 0.82048702, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4983, + "time_per_iteration": 2.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_mlp": 1.03656244, + "diversity_loss_mlp": 0.0, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.06921858371067632, + "language_loss": 0.83688623, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84734166, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 4984, + "time_per_iteration": 2.4890353679656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047663, + "balance_loss_mlp": 1.03846598, + "diversity_loss_mlp": 0.0, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.0655069361339347, + "language_loss": 0.78102469, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79150128, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.09204102, + "routerloss_mlp": 0.0, + "step": 4985, + "time_per_iteration": 2.5810418128967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_mlp": 1.03757238, + "diversity_loss_mlp": 0.0, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.0696645623460603, + "language_loss": 0.80404431, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81450725, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4986, + "time_per_iteration": 3.0027694702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044808, + "balance_loss_mlp": 1.03609419, + "diversity_loss_mlp": 0.0, + "epoch": 0.9594074644093882, + "flos": 574490092032.0, + "grad_norm": 0.06168953583598696, + "language_loss": 0.70886958, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71931762, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 4987, + "time_per_iteration": 2.7255663871765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046824, + "balance_loss_mlp": 1.03819966, + "diversity_loss_mlp": 0.0, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.0653725751798929, + "language_loss": 0.78369594, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79416412, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 4988, + "time_per_iteration": 2.7598073482513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042023, + "balance_loss_mlp": 1.03311229, + "diversity_loss_mlp": 0.0, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.07692135244194774, + "language_loss": 0.78684759, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79726779, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 4989, + "time_per_iteration": 2.5303213596343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_mlp": 1.03871953, + "diversity_loss_mlp": 0.0, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.08379738751426644, + "language_loss": 0.85613489, + "learning_rate": 4.193269428723889e-06, + "loss": 0.866611, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 4990, + "time_per_iteration": 2.614570379257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046822, + "balance_loss_mlp": 1.03815556, + "diversity_loss_mlp": 0.0, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.08435652614677631, + "language_loss": 0.78316408, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79363227, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.08679199, + "routerloss_mlp": 0.0, + "step": 4991, + "time_per_iteration": 2.748410224914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_mlp": 1.03710628, + "diversity_loss_mlp": 0.0, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.06666949415129908, + "language_loss": 0.79405791, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80451697, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 4992, + "time_per_iteration": 2.6129846572875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049094, + "balance_loss_mlp": 1.04027796, + "diversity_loss_mlp": 0.0, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.06505303405528073, + "language_loss": 0.82855588, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83904684, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 4993, + "time_per_iteration": 2.697122097015381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048804, + "balance_loss_mlp": 1.03996491, + "diversity_loss_mlp": 0.0, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.05557800406655289, + "language_loss": 0.86002243, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87051046, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 4994, + "time_per_iteration": 3.2234411239624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049247, + "balance_loss_mlp": 1.04041374, + "diversity_loss_mlp": 0.0, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.05698113601966363, + "language_loss": 0.75638676, + "learning_rate": 3.994358637073036e-06, + "loss": 0.7668792, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 4995, + "time_per_iteration": 2.811509847640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047757, + "balance_loss_mlp": 1.03900671, + "diversity_loss_mlp": 0.0, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.06182635414067332, + "language_loss": 0.85539091, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86586845, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 4996, + "time_per_iteration": 2.6234097480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00776504, + "balance_loss_mlp": 1.30815172, + "diversity_loss_mlp": 0.22351003, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.03585301103792293, + "language_loss": 0.82592523, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83369029, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067326, + "step": 4997, + "time_per_iteration": 2.7915287017822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077909, + "balance_loss_mlp": 1.31180668, + "diversity_loss_mlp": 0.22519468, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.032099715647482555, + "language_loss": 0.77762806, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78541887, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105895, + "step": 4998, + "time_per_iteration": 2.8831381797790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_mlp": 1.03671455, + "diversity_loss_mlp": 0.0, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.0659062812504805, + "language_loss": 0.75562751, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76608419, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 4999, + "time_per_iteration": 2.5965874195098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_mlp": 1.0409348, + "diversity_loss_mlp": 0.0, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.06697543006955084, + "language_loss": 0.80806673, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81856602, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 5000, + "time_per_iteration": 2.5651276111602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_mlp": 1.0366478, + "diversity_loss_mlp": 0.0, + "epoch": 0.9621008080030781, + "flos": 595635379200.0, + "grad_norm": 0.0765647536824451, + "language_loss": 0.75030309, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76075912, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 5001, + "time_per_iteration": 2.750175952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048226, + "balance_loss_mlp": 1.03932738, + "diversity_loss_mlp": 0.0, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.07727900973651224, + "language_loss": 0.81910348, + "learning_rate": 3.723971737693899e-06, + "loss": 0.82958579, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5002, + "time_per_iteration": 2.665245294570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048413, + "balance_loss_mlp": 1.03946078, + "diversity_loss_mlp": 0.0, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.0718035222006464, + "language_loss": 0.80944788, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81993198, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.08959961, + "routerloss_mlp": 0.0, + "step": 5003, + "time_per_iteration": 2.7820627689361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047459, + "balance_loss_mlp": 1.03892946, + "diversity_loss_mlp": 0.0, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.09658490174394786, + "language_loss": 0.85061997, + "learning_rate": 3.648452157695936e-06, + "loss": 0.86109459, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.08538818, + "routerloss_mlp": 0.0, + "step": 5004, + "time_per_iteration": 2.5650572776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051022, + "balance_loss_mlp": 1.04228425, + "diversity_loss_mlp": 0.0, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.07079516660765435, + "language_loss": 0.82573175, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83624196, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 5005, + "time_per_iteration": 2.808318853378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054525, + "balance_loss_mlp": 1.04536355, + "diversity_loss_mlp": 0.0, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.06358415598016834, + "language_loss": 0.77436566, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78491098, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.09161377, + "routerloss_mlp": 0.0, + "step": 5006, + "time_per_iteration": 2.7581474781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_mlp": 1.0372808, + "diversity_loss_mlp": 0.0, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.06259715736563402, + "language_loss": 0.78214157, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79260308, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5007, + "time_per_iteration": 2.8067400455474854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047305, + "balance_loss_mlp": 1.03849518, + "diversity_loss_mlp": 0.0, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.0652004870167461, + "language_loss": 0.8097052, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82017827, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5008, + "time_per_iteration": 2.6624722480773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_mlp": 1.03475678, + "diversity_loss_mlp": 0.0, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.07542594197578673, + "language_loss": 0.85320652, + "learning_rate": 3.463025724284974e-06, + "loss": 0.8636443, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.09020996, + "routerloss_mlp": 0.0, + "step": 5009, + "time_per_iteration": 2.649427890777588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_mlp": 1.03576136, + "diversity_loss_mlp": 0.0, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.06511821335900564, + "language_loss": 0.75133872, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76178598, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 5010, + "time_per_iteration": 2.780074119567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046262, + "balance_loss_mlp": 1.03736854, + "diversity_loss_mlp": 0.0, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.07329288404167861, + "language_loss": 0.84246582, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.8529284, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5011, + "time_per_iteration": 2.651488780975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_mlp": 1.03848636, + "diversity_loss_mlp": 0.0, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.06680869041289342, + "language_loss": 0.88673419, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89720607, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5012, + "time_per_iteration": 2.6489691734313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_mlp": 1.03752685, + "diversity_loss_mlp": 0.0, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.06514803880345414, + "language_loss": 0.83791411, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.848378, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5013, + "time_per_iteration": 2.57792067527771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_mlp": 1.03800964, + "diversity_loss_mlp": 0.0, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.06277044595718272, + "language_loss": 0.78603232, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79649818, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5014, + "time_per_iteration": 2.75705885887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049, + "balance_loss_mlp": 1.04026842, + "diversity_loss_mlp": 0.0, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.10341285482454692, + "language_loss": 0.84443051, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85492051, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5015, + "time_per_iteration": 2.7894856929779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_mlp": 1.03886533, + "diversity_loss_mlp": 0.0, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.07303173278488923, + "language_loss": 0.86459041, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87506503, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5016, + "time_per_iteration": 2.774505376815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_mlp": 1.036268, + "diversity_loss_mlp": 0.0, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.06203977251445547, + "language_loss": 0.81297398, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82342726, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.09063721, + "routerloss_mlp": 0.0, + "step": 5017, + "time_per_iteration": 2.7457613945007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045833, + "balance_loss_mlp": 1.0369395, + "diversity_loss_mlp": 0.0, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.07389028070446926, + "language_loss": 0.80003834, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81049669, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5018, + "time_per_iteration": 2.5559167861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_mlp": 1.03835368, + "diversity_loss_mlp": 0.0, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.05876051048061586, + "language_loss": 0.82367206, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83414584, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 5019, + "time_per_iteration": 2.7295024394989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048475, + "balance_loss_mlp": 1.03974926, + "diversity_loss_mlp": 0.0, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.0742577236438263, + "language_loss": 0.82501537, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83550012, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5020, + "time_per_iteration": 2.732282876968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048667, + "balance_loss_mlp": 1.0402087, + "diversity_loss_mlp": 0.0, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.07257927833574024, + "language_loss": 0.83663607, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84712267, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.08465576, + "routerloss_mlp": 0.0, + "step": 5021, + "time_per_iteration": 2.8645994663238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003265, + "balance_loss_mlp": 0.99885476, + "diversity_loss_mlp": 0.0, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.004502170891661989, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81697512, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5022, + "time_per_iteration": 4.703518390655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049998, + "balance_loss_mlp": 1.04122436, + "diversity_loss_mlp": 0.0, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.08988904326994861, + "language_loss": 0.81278229, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82328224, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5023, + "time_per_iteration": 2.581846237182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010486, + "balance_loss_mlp": 1.03996289, + "diversity_loss_mlp": 0.0, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.07024301133900458, + "language_loss": 0.85463035, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86511636, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5024, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047106, + "balance_loss_mlp": 1.03803396, + "diversity_loss_mlp": 0.0, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.0827162063613028, + "language_loss": 0.82914466, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.8396157, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.09075928, + "routerloss_mlp": 0.0, + "step": 5025, + "time_per_iteration": 2.4615111351013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_mlp": 1.03826559, + "diversity_loss_mlp": 0.0, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.061914921870518225, + "language_loss": 0.85848838, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86895955, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5026, + "time_per_iteration": 2.6631765365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045557, + "balance_loss_mlp": 1.03661585, + "diversity_loss_mlp": 0.0, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.10389844527854888, + "language_loss": 0.75783134, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76828694, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 5027, + "time_per_iteration": 2.6192245483398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_mlp": 1.03396487, + "diversity_loss_mlp": 0.0, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.06651584976337663, + "language_loss": 0.80529153, + "learning_rate": 2.802372171957057e-06, + "loss": 0.8157168, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 5028, + "time_per_iteration": 2.6251182556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047556, + "balance_loss_mlp": 1.03856707, + "diversity_loss_mlp": 0.0, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.06722764033814799, + "language_loss": 0.79839933, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80887485, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 5029, + "time_per_iteration": 2.7434749603271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_mlp": 1.03893399, + "diversity_loss_mlp": 0.0, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.06316563947076154, + "language_loss": 0.80004889, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.81052518, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5030, + "time_per_iteration": 2.9535553455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003251, + "balance_loss_mlp": 0.99884009, + "diversity_loss_mlp": 0.0, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.004505813137803552, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76566613, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5031, + "time_per_iteration": 4.665933609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049308, + "balance_loss_mlp": 1.04061723, + "diversity_loss_mlp": 0.0, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.07437893126618236, + "language_loss": 0.79223692, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80272996, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5032, + "time_per_iteration": 2.6745200157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003248, + "balance_loss_mlp": 0.99883741, + "diversity_loss_mlp": 0.0, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.004505868190554417, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79078054, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5033, + "time_per_iteration": 4.830533027648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_mlp": 1.03500438, + "diversity_loss_mlp": 0.0, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07679444902591688, + "language_loss": 0.81878042, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82921857, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5034, + "time_per_iteration": 2.7140636444091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048554, + "balance_loss_mlp": 1.03991711, + "diversity_loss_mlp": 0.0, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.06455129167487729, + "language_loss": 0.84188414, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85236967, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 5035, + "time_per_iteration": 2.7100539207458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048563, + "balance_loss_mlp": 1.03969407, + "diversity_loss_mlp": 0.0, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.07457469088112735, + "language_loss": 0.8308925, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84137809, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5036, + "time_per_iteration": 3.0273303985595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775046, + "balance_loss_mlp": 1.30442953, + "diversity_loss_mlp": 0.22392677, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.03634578837356394, + "language_loss": 0.79774749, + "learning_rate": 2.513747116326126e-06, + "loss": 0.805498, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01086747, + "step": 5037, + "time_per_iteration": 2.496250629425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_mlp": 1.03794384, + "diversity_loss_mlp": 0.0, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07461894486851982, + "language_loss": 0.77795297, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78841919, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 5038, + "time_per_iteration": 2.735316753387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_mlp": 1.03756189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.07661744515255002, + "language_loss": 0.79197067, + "learning_rate": 2.451732453851385e-06, + "loss": 0.8024317, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.08551025, + "routerloss_mlp": 0.0, + "step": 5039, + "time_per_iteration": 2.7147159576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043927, + "balance_loss_mlp": 1.03520727, + "diversity_loss_mlp": 0.0, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.06459150402718168, + "language_loss": 0.82762325, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83806252, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 5040, + "time_per_iteration": 2.5953493118286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_mlp": 1.03482664, + "diversity_loss_mlp": 0.0, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.08520160899358113, + "language_loss": 0.87077874, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88121581, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5041, + "time_per_iteration": 2.470695972442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047399, + "balance_loss_mlp": 1.03847671, + "diversity_loss_mlp": 0.0, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.0661289335538221, + "language_loss": 0.85483861, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86531258, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 5042, + "time_per_iteration": 2.7053682804107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104593, + "balance_loss_mlp": 1.03708434, + "diversity_loss_mlp": 0.0, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.06476327659734085, + "language_loss": 0.81779778, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82825708, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5043, + "time_per_iteration": 2.6728196144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_mlp": 1.03794777, + "diversity_loss_mlp": 0.0, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.08267177511200062, + "language_loss": 0.76453322, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77500081, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5044, + "time_per_iteration": 2.5768589973449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_mlp": 1.03866804, + "diversity_loss_mlp": 0.0, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.06897516762466789, + "language_loss": 0.80167985, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81215596, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 5045, + "time_per_iteration": 2.795342206954956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_mlp": 1.03677726, + "diversity_loss_mlp": 0.0, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.0755169004935037, + "language_loss": 0.83042562, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84088135, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5046, + "time_per_iteration": 2.5994091033935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_mlp": 1.03778839, + "diversity_loss_mlp": 0.0, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.07013648257820884, + "language_loss": 0.80700469, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81747067, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.08807373, + "routerloss_mlp": 0.0, + "step": 5047, + "time_per_iteration": 2.695164680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_mlp": 1.03531933, + "diversity_loss_mlp": 0.0, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.06514840583334829, + "language_loss": 0.80631614, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81675637, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 5048, + "time_per_iteration": 2.692713975906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_mlp": 1.04089093, + "diversity_loss_mlp": 0.0, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.06192564808689567, + "language_loss": 0.83786458, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84835804, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.08459473, + "routerloss_mlp": 0.0, + "step": 5049, + "time_per_iteration": 2.934725761413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049582, + "balance_loss_mlp": 1.04098153, + "diversity_loss_mlp": 0.0, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.0692175783084948, + "language_loss": 0.81435341, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82484925, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 5050, + "time_per_iteration": 2.732560873031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_mlp": 1.03919172, + "diversity_loss_mlp": 0.0, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.07244382675246107, + "language_loss": 0.77611834, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78659672, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5051, + "time_per_iteration": 2.553946018218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.03976798, + "diversity_loss_mlp": 0.0, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.058871704281843434, + "language_loss": 0.78665662, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79714322, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5052, + "time_per_iteration": 2.700554847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104788, + "balance_loss_mlp": 1.03924966, + "diversity_loss_mlp": 0.0, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.06621518812082018, + "language_loss": 0.79820377, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80868256, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.08630371, + "routerloss_mlp": 0.0, + "step": 5053, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048468, + "balance_loss_mlp": 1.03977799, + "diversity_loss_mlp": 0.0, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.07341823776686772, + "language_loss": 0.78280944, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79329413, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5054, + "time_per_iteration": 2.773064136505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047195, + "balance_loss_mlp": 1.03857064, + "diversity_loss_mlp": 0.0, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.07604483960314544, + "language_loss": 0.79561597, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80608791, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5055, + "time_per_iteration": 2.6578407287597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_mlp": 1.03799832, + "diversity_loss_mlp": 0.0, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.0731380618710485, + "language_loss": 0.80641949, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81688601, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5056, + "time_per_iteration": 2.778132438659668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_mlp": 1.04049361, + "diversity_loss_mlp": 0.0, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06341434190577709, + "language_loss": 0.83532751, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84582257, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.09008789, + "routerloss_mlp": 0.0, + "step": 5057, + "time_per_iteration": 3.070535898208618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104959, + "balance_loss_mlp": 1.04053009, + "diversity_loss_mlp": 0.0, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.06728126412432961, + "language_loss": 0.84373492, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85423088, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.09057617, + "routerloss_mlp": 0.0, + "step": 5058, + "time_per_iteration": 2.7407948970794678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041956, + "balance_loss_mlp": 1.03302085, + "diversity_loss_mlp": 0.0, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.06896959434834592, + "language_loss": 0.77172613, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78214562, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.0894165, + "routerloss_mlp": 0.0, + "step": 5059, + "time_per_iteration": 2.593101978302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_mlp": 1.03695965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.06467450172514855, + "language_loss": 0.80509335, + "learning_rate": 1.84724562509897e-06, + "loss": 0.8155489, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.08599854, + "routerloss_mlp": 0.0, + "step": 5060, + "time_per_iteration": 3.130805015563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_mlp": 1.03940582, + "diversity_loss_mlp": 0.0, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.07143647662877724, + "language_loss": 0.7819376, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79241908, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5061, + "time_per_iteration": 2.7030551433563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105027, + "balance_loss_mlp": 1.04135358, + "diversity_loss_mlp": 0.0, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.07722158587827427, + "language_loss": 0.8399719, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.8504746, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 5062, + "time_per_iteration": 2.7250983715057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588666, + "balance_loss_mlp": 1.0272553, + "diversity_loss_mlp": 0.13149816, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.001262541739400147, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.76580763, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928975, + "step": 5063, + "time_per_iteration": 4.984234094619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0058866, + "balance_loss_mlp": 1.02724576, + "diversity_loss_mlp": 0.13149402, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0012626586872862898, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.8026638, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00929021, + "step": 5064, + "time_per_iteration": 4.959820032119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_mlp": 1.03426552, + "diversity_loss_mlp": 0.0, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.061567595116442546, + "language_loss": 0.76945543, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77988654, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5065, + "time_per_iteration": 2.8605847358703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_mlp": 1.03767872, + "diversity_loss_mlp": 0.0, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + "grad_norm": 0.06408228412896971, + "language_loss": 0.77837121, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78883654, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5066, + "time_per_iteration": 2.8428735733032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051292, + "balance_loss_mlp": 1.04271507, + "diversity_loss_mlp": 0.0, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.06431524577835049, + "language_loss": 0.82438833, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83490127, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 5067, + "time_per_iteration": 2.9748458862304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046087, + "balance_loss_mlp": 1.03706264, + "diversity_loss_mlp": 0.0, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.07892101071391965, + "language_loss": 0.76234651, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.7728073, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.090271, + "routerloss_mlp": 0.0, + "step": 5068, + "time_per_iteration": 2.720424175262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_mlp": 1.04073572, + "diversity_loss_mlp": 0.0, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.06592156995071553, + "language_loss": 0.84109974, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.85159504, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5069, + "time_per_iteration": 2.5736031532287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048842, + "balance_loss_mlp": 1.03985965, + "diversity_loss_mlp": 0.0, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.08190997494854581, + "language_loss": 0.85377657, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86426497, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.08990479, + "routerloss_mlp": 0.0, + "step": 5070, + "time_per_iteration": 2.8101613521575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_mlp": 1.04077792, + "diversity_loss_mlp": 0.0, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.0795575480548678, + "language_loss": 0.82202387, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83251673, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.08514404, + "routerloss_mlp": 0.0, + "step": 5071, + "time_per_iteration": 2.890133857727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047822, + "balance_loss_mlp": 1.03918517, + "diversity_loss_mlp": 0.0, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.08438970561016089, + "language_loss": 0.79632509, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80680329, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.08642578, + "routerloss_mlp": 0.0, + "step": 5072, + "time_per_iteration": 2.678088426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_mlp": 1.03586388, + "diversity_loss_mlp": 0.0, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.07402137285679701, + "language_loss": 0.80289578, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81334102, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5073, + "time_per_iteration": 2.5970799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048341, + "balance_loss_mlp": 1.03980589, + "diversity_loss_mlp": 0.0, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07471313714776352, + "language_loss": 0.82160485, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83208829, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.08544922, + "routerloss_mlp": 0.0, + "step": 5074, + "time_per_iteration": 2.724550724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047011, + "balance_loss_mlp": 1.03809404, + "diversity_loss_mlp": 0.0, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.07314052432610715, + "language_loss": 0.81910318, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.82957333, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.08917236, + "routerloss_mlp": 0.0, + "step": 5075, + "time_per_iteration": 2.6065866947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_mlp": 1.03871298, + "diversity_loss_mlp": 0.0, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.06220869349054033, + "language_loss": 0.78624564, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79671854, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.08575439, + "routerloss_mlp": 0.0, + "step": 5076, + "time_per_iteration": 2.7079405784606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00791822, + "balance_loss_mlp": 1.33702493, + "diversity_loss_mlp": 0.22525913, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.034551396825965836, + "language_loss": 0.85462183, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86254001, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01067994, + "step": 5077, + "time_per_iteration": 2.641120195388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_mlp": 1.03591776, + "diversity_loss_mlp": 0.0, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.06272749449600955, + "language_loss": 0.84269607, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.85314226, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5078, + "time_per_iteration": 2.6361942291259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049536, + "balance_loss_mlp": 1.04075623, + "diversity_loss_mlp": 0.0, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.06781093629055318, + "language_loss": 0.80375177, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81424713, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5079, + "time_per_iteration": 2.799654245376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.03826737, + "diversity_loss_mlp": 0.0, + "epoch": 0.9772989611388996, + "flos": 532090861056.0, + "grad_norm": 0.07054076117423486, + "language_loss": 0.8182112, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82868278, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.08892822, + "routerloss_mlp": 0.0, + "step": 5080, + "time_per_iteration": 2.6261301040649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_mlp": 1.03806853, + "diversity_loss_mlp": 0.0, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.08607720599847436, + "language_loss": 0.85967886, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87014663, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.0871582, + "routerloss_mlp": 0.0, + "step": 5081, + "time_per_iteration": 3.0126025676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100325, + "balance_loss_mlp": 0.99883974, + "diversity_loss_mlp": 0.0, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.004504556807133143, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79898739, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5082, + "time_per_iteration": 4.989394903182983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048062, + "balance_loss_mlp": 1.03916299, + "diversity_loss_mlp": 0.0, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.08681180158775233, + "language_loss": 0.84184444, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.85232502, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5083, + "time_per_iteration": 2.694652557373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049697, + "balance_loss_mlp": 1.04097056, + "diversity_loss_mlp": 0.0, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.06774609280174271, + "language_loss": 0.81603408, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82653111, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.08734131, + "routerloss_mlp": 0.0, + "step": 5084, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_mlp": 1.04026437, + "diversity_loss_mlp": 0.0, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.06648884426689973, + "language_loss": 0.84724671, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85773802, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5085, + "time_per_iteration": 2.7240426540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_mlp": 1.03747988, + "diversity_loss_mlp": 0.0, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.07204518126062733, + "language_loss": 0.82956612, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84002984, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5086, + "time_per_iteration": 2.872143507003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_mlp": 1.03782678, + "diversity_loss_mlp": 0.0, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.06206480321158436, + "language_loss": 0.77373308, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.7841996, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 5087, + "time_per_iteration": 2.6224989891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_mlp": 1.0377537, + "diversity_loss_mlp": 0.0, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.07890344203678189, + "language_loss": 0.80865037, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81911391, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5088, + "time_per_iteration": 3.021143913269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046697, + "balance_loss_mlp": 1.03788161, + "diversity_loss_mlp": 0.0, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.0707462132351249, + "language_loss": 0.83914399, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.84961092, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5089, + "time_per_iteration": 2.6006627082824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_mlp": 1.03978372, + "diversity_loss_mlp": 0.0, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.07258951215182398, + "language_loss": 0.86249393, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87298024, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5090, + "time_per_iteration": 2.57961106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104929, + "balance_loss_mlp": 1.04054606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.0687229233050849, + "language_loss": 0.81661642, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82710934, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5091, + "time_per_iteration": 2.774179458618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_mlp": 1.03394735, + "diversity_loss_mlp": 0.0, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.06774886047931106, + "language_loss": 0.86759937, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87803102, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.09222412, + "routerloss_mlp": 0.0, + "step": 5092, + "time_per_iteration": 2.5726425647735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044727, + "balance_loss_mlp": 1.03592968, + "diversity_loss_mlp": 0.0, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06986809662631227, + "language_loss": 0.84486377, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85531104, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5093, + "time_per_iteration": 2.734572649002075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_mlp": 1.03564167, + "diversity_loss_mlp": 0.0, + "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06627525100654652, + "language_loss": 0.8142283, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82467258, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.08795166, + "routerloss_mlp": 0.0, + "step": 5094, + "time_per_iteration": 2.901499032974243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104922, + "balance_loss_mlp": 1.04027307, + "diversity_loss_mlp": 0.0, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.05858269256579561, + "language_loss": 0.84523547, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85572767, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.08953857, + "routerloss_mlp": 0.0, + "step": 5095, + "time_per_iteration": 2.749011754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_mlp": 1.03991067, + "diversity_loss_mlp": 0.0, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.08054047976821545, + "language_loss": 0.8013413, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.81182784, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.08752441, + "routerloss_mlp": 0.0, + "step": 5096, + "time_per_iteration": 2.6734437942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048068, + "balance_loss_mlp": 1.03946686, + "diversity_loss_mlp": 0.0, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.06350240490258963, + "language_loss": 0.7813378, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79181844, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5097, + "time_per_iteration": 2.726039409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047979, + "balance_loss_mlp": 1.03928292, + "diversity_loss_mlp": 0.0, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.06123275422091068, + "language_loss": 0.73776084, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74824059, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.08709717, + "routerloss_mlp": 0.0, + "step": 5098, + "time_per_iteration": 2.6765458583831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00779933, + "balance_loss_mlp": 1.31478071, + "diversity_loss_mlp": 0.22396225, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.03778989641153832, + "language_loss": 0.80182397, + "learning_rate": 9.509698444908344e-07, + "loss": 0.8096233, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.0105617, + "step": 5099, + "time_per_iteration": 2.6399407386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_mlp": 1.03835607, + "diversity_loss_mlp": 0.0, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.0712325944726878, + "language_loss": 0.79504228, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80551302, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.08728027, + "routerloss_mlp": 0.0, + "step": 5100, + "time_per_iteration": 2.605034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_mlp": 1.03872824, + "diversity_loss_mlp": 0.0, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.07915756516451043, + "language_loss": 0.80425239, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81472808, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5101, + "time_per_iteration": 2.653615713119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_mlp": 1.03676605, + "diversity_loss_mlp": 0.0, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.07121268040890673, + "language_loss": 0.84309268, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85354877, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.08837891, + "routerloss_mlp": 0.0, + "step": 5102, + "time_per_iteration": 2.7331223487854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048136, + "balance_loss_mlp": 1.03933203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.06082212845964829, + "language_loss": 0.80932826, + "learning_rate": 8.756982280578307e-07, + "loss": 0.81980968, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5103, + "time_per_iteration": 2.731088876724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_mlp": 1.03868246, + "diversity_loss_mlp": 0.0, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.06577153639103081, + "language_loss": 0.82189977, + "learning_rate": 8.573647489714676e-07, + "loss": 0.83237398, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.08740234, + "routerloss_mlp": 0.0, + "step": 5104, + "time_per_iteration": 2.952533721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_mlp": 1.03831923, + "diversity_loss_mlp": 0.0, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.06798431241240387, + "language_loss": 0.84167528, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85214722, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.08880615, + "routerloss_mlp": 0.0, + "step": 5105, + "time_per_iteration": 2.86313533782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_mlp": 1.03541374, + "diversity_loss_mlp": 0.0, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.06686184516115971, + "language_loss": 0.81452221, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82496238, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.08612061, + "routerloss_mlp": 0.0, + "step": 5106, + "time_per_iteration": 2.708230495452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045424, + "balance_loss_mlp": 1.03651953, + "diversity_loss_mlp": 0.0, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.07713140113072105, + "language_loss": 0.72798324, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73843747, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5107, + "time_per_iteration": 2.6602892875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_mlp": 1.0389818, + "diversity_loss_mlp": 0.0, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.06073968757615098, + "language_loss": 0.82624412, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83672357, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.08966064, + "routerloss_mlp": 0.0, + "step": 5108, + "time_per_iteration": 2.637178421020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_mlp": 1.03743768, + "diversity_loss_mlp": 0.0, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.05986915063822493, + "language_loss": 0.84416521, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85462821, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5109, + "time_per_iteration": 2.827469825744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046109, + "balance_loss_mlp": 1.03744864, + "diversity_loss_mlp": 0.0, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.05962385879994031, + "language_loss": 0.82830834, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83876944, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5110, + "time_per_iteration": 2.7789480686187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_mlp": 1.0431633, + "diversity_loss_mlp": 0.0, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.08038091049338392, + "language_loss": 0.84353125, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85405189, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5111, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046838, + "balance_loss_mlp": 1.03787303, + "diversity_loss_mlp": 0.0, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06156712151194387, + "language_loss": 0.79174638, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80221474, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.08972168, + "routerloss_mlp": 0.0, + "step": 5112, + "time_per_iteration": 2.8035426139831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045207, + "balance_loss_mlp": 1.03661203, + "diversity_loss_mlp": 0.0, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.062084580460965294, + "language_loss": 0.7939449, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80439693, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.08605957, + "routerloss_mlp": 0.0, + "step": 5113, + "time_per_iteration": 3.4046199321746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051509, + "balance_loss_mlp": 1.04283631, + "diversity_loss_mlp": 0.0, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.08317258429297145, + "language_loss": 0.76198953, + "learning_rate": 6.846892349181566e-07, + "loss": 0.77250463, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.08685303, + "routerloss_mlp": 0.0, + "step": 5114, + "time_per_iteration": 2.668950319290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050737, + "balance_loss_mlp": 1.04192185, + "diversity_loss_mlp": 0.0, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.07567501347544295, + "language_loss": 0.79288757, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80339497, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5115, + "time_per_iteration": 2.9638354778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_mlp": 1.03803074, + "diversity_loss_mlp": 0.0, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.07643720957533141, + "language_loss": 0.85790366, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86837137, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5116, + "time_per_iteration": 2.5682291984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_mlp": 1.03958189, + "diversity_loss_mlp": 0.0, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.07092299014904967, + "language_loss": 0.84942091, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85990334, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5117, + "time_per_iteration": 2.637977123260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003203, + "balance_loss_mlp": 0.99879217, + "diversity_loss_mlp": 0.0, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.004507137876237267, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78165722, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5118, + "time_per_iteration": 4.920542001724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_mlp": 1.04354155, + "diversity_loss_mlp": 0.0, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.07669150259725825, + "language_loss": 0.82077813, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83130145, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5119, + "time_per_iteration": 2.606952428817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_mlp": 1.03820455, + "diversity_loss_mlp": 0.0, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.061362579804974775, + "language_loss": 0.83024836, + "learning_rate": 5.903883659301167e-07, + "loss": 0.84071916, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 5120, + "time_per_iteration": 2.588484525680542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_mlp": 1.04235184, + "diversity_loss_mlp": 0.0, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.0845871079135169, + "language_loss": 0.81128502, + "learning_rate": 5.753501275193029e-07, + "loss": 0.82179761, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5121, + "time_per_iteration": 2.6300275325775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044388, + "balance_loss_mlp": 1.03557277, + "diversity_loss_mlp": 0.0, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.07512722548004026, + "language_loss": 0.80214739, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81259131, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5122, + "time_per_iteration": 2.528691053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_mlp": 1.04198194, + "diversity_loss_mlp": 0.0, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.1156037342387967, + "language_loss": 0.76233137, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77284151, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.09033203, + "routerloss_mlp": 0.0, + "step": 5123, + "time_per_iteration": 3.356245517730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_mlp": 1.04011154, + "diversity_loss_mlp": 0.0, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.0641282180922578, + "language_loss": 0.82703745, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83752573, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 5124, + "time_per_iteration": 2.625892400741577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051607, + "balance_loss_mlp": 1.04285097, + "diversity_loss_mlp": 0.0, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.06640628679407225, + "language_loss": 0.8357659, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84628195, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5125, + "time_per_iteration": 2.6943421363830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045914, + "balance_loss_mlp": 1.03727758, + "diversity_loss_mlp": 0.0, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.07733437230097125, + "language_loss": 0.78536099, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79582012, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5126, + "time_per_iteration": 2.663972854614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047998, + "balance_loss_mlp": 1.03925443, + "diversity_loss_mlp": 0.0, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.06032739387712679, + "language_loss": 0.82490909, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83538908, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.08758545, + "routerloss_mlp": 0.0, + "step": 5127, + "time_per_iteration": 2.6729202270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003205, + "balance_loss_mlp": 0.99879479, + "diversity_loss_mlp": 0.0, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.004506363295624896, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80185938, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5128, + "time_per_iteration": 4.911416530609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0078477, + "balance_loss_mlp": 1.32232308, + "diversity_loss_mlp": 0.22574797, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.03417894522546616, + "language_loss": 0.79182434, + "learning_rate": 4.620248732582488e-07, + "loss": 0.79967207, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01073482, + "step": 5129, + "time_per_iteration": 2.7484471797943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0077241, + "balance_loss_mlp": 1.299196, + "diversity_loss_mlp": 0.22459432, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.0327459890880189, + "language_loss": 0.86703897, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87476307, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01051447, + "step": 5130, + "time_per_iteration": 3.2471301555633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048775, + "balance_loss_mlp": 1.03996539, + "diversity_loss_mlp": 0.0, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 0.07462217297713208, + "language_loss": 0.82822615, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83871394, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.0881958, + "routerloss_mlp": 0.0, + "step": 5131, + "time_per_iteration": 3.0264713764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_mlp": 1.03579676, + "diversity_loss_mlp": 0.0, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.09684750541354396, + "language_loss": 0.78034192, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.7907899, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5132, + "time_per_iteration": 2.501401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047696, + "balance_loss_mlp": 1.03900599, + "diversity_loss_mlp": 0.0, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.06608816794625222, + "language_loss": 0.86122322, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87170017, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5133, + "time_per_iteration": 2.595851421356201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046161, + "balance_loss_mlp": 1.03731585, + "diversity_loss_mlp": 0.0, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.07376071696211185, + "language_loss": 0.81970578, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83016741, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.08856201, + "routerloss_mlp": 0.0, + "step": 5134, + "time_per_iteration": 2.899350881576538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01003204, + "balance_loss_mlp": 0.99879336, + "diversity_loss_mlp": 0.0, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.004506854986446618, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80821157, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.04418945, + "routerloss_mlp": 0.0, + "step": 5135, + "time_per_iteration": 4.867925405502319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_mlp": 1.04155612, + "diversity_loss_mlp": 0.0, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.06071682459398163, + "language_loss": 0.81917751, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82968003, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5136, + "time_per_iteration": 2.9082465171813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00784556, + "balance_loss_mlp": 1.3212409, + "diversity_loss_mlp": 0.22676432, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.028741736801368708, + "language_loss": 0.84462202, + "learning_rate": 3.611116155572969e-07, + "loss": 0.8524676, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01055351, + "step": 5137, + "time_per_iteration": 2.687598705291748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_mlp": 1.03901052, + "diversity_loss_mlp": 0.0, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.07713102005937741, + "language_loss": 0.80440414, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81488419, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.09002686, + "routerloss_mlp": 0.0, + "step": 5138, + "time_per_iteration": 2.7116920948028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_mlp": 1.03775895, + "diversity_loss_mlp": 0.0, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.07051878557324726, + "language_loss": 0.86536169, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87582827, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5139, + "time_per_iteration": 2.477654218673706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045545, + "balance_loss_mlp": 1.03696823, + "diversity_loss_mlp": 0.0, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.05631423705134008, + "language_loss": 0.90553308, + "learning_rate": 3.264696333806771e-07, + "loss": 0.9159885, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5140, + "time_per_iteration": 2.789351224899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049588, + "balance_loss_mlp": 1.04073703, + "diversity_loss_mlp": 0.0, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.06262136237267299, + "language_loss": 0.80186951, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81236541, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5141, + "time_per_iteration": 3.521420478820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104667, + "balance_loss_mlp": 1.03778934, + "diversity_loss_mlp": 0.0, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.0653214866342138, + "language_loss": 0.81865728, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.82912397, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5142, + "time_per_iteration": 2.6905152797698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_mlp": 1.03794312, + "diversity_loss_mlp": 0.0, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.06437869536727725, + "language_loss": 0.83950132, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.84996706, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5143, + "time_per_iteration": 2.9280619621276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00775776, + "balance_loss_mlp": 1.30826199, + "diversity_loss_mlp": 0.22223487, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.03094231827555858, + "language_loss": 0.81775147, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82550919, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01052798, + "step": 5144, + "time_per_iteration": 2.6317298412323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046559, + "balance_loss_mlp": 1.03809488, + "diversity_loss_mlp": 0.0, + "epoch": 0.9898037706810312, + "flos": 567339614208.0, + "grad_norm": 0.06731066884585553, + "language_loss": 0.80676913, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81723469, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.08465576, + "routerloss_mlp": 0.0, + "step": 5145, + "time_per_iteration": 2.6584229469299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00588607, + "balance_loss_mlp": 1.02718186, + "diversity_loss_mlp": 0.13146883, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0012619225721446723, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.7873503, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.00928183, + "step": 5146, + "time_per_iteration": 4.944198369979858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046886, + "balance_loss_mlp": 1.03796947, + "diversity_loss_mlp": 0.0, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06397137457157225, + "language_loss": 0.85200578, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86247468, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.0892334, + "routerloss_mlp": 0.0, + "step": 5147, + "time_per_iteration": 2.920581579208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_mlp": 1.03715396, + "diversity_loss_mlp": 0.0, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.06276990657159663, + "language_loss": 0.82674694, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83720535, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5148, + "time_per_iteration": 2.5547163486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047255, + "balance_loss_mlp": 1.0385884, + "diversity_loss_mlp": 0.0, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.06810375608375513, + "language_loss": 0.80711174, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81758434, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5149, + "time_per_iteration": 3.4215774536132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045945, + "balance_loss_mlp": 1.03725505, + "diversity_loss_mlp": 0.0, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.08140595339599294, + "language_loss": 0.84472948, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85518897, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5150, + "time_per_iteration": 3.104238271713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_mlp": 1.03784227, + "diversity_loss_mlp": 0.0, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.07994567324384995, + "language_loss": 0.80567187, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81613684, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5151, + "time_per_iteration": 2.597073554992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_mlp": 1.03738976, + "diversity_loss_mlp": 0.0, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.0788095686937427, + "language_loss": 0.79632246, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80678451, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 5152, + "time_per_iteration": 2.672553062438965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104469, + "balance_loss_mlp": 1.03561211, + "diversity_loss_mlp": 0.0, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.06752430275446872, + "language_loss": 0.81667304, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82711995, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.09088135, + "routerloss_mlp": 0.0, + "step": 5153, + "time_per_iteration": 2.6830427646636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047286, + "balance_loss_mlp": 1.03867936, + "diversity_loss_mlp": 0.0, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.06636262173491685, + "language_loss": 0.86006486, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.8705377, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.08618164, + "routerloss_mlp": 0.0, + "step": 5154, + "time_per_iteration": 2.6368730068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104807, + "balance_loss_mlp": 1.03920066, + "diversity_loss_mlp": 0.0, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.060555053830850476, + "language_loss": 0.82984126, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.84032202, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.08874512, + "routerloss_mlp": 0.0, + "step": 5155, + "time_per_iteration": 2.989109754562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043535, + "balance_loss_mlp": 1.03463578, + "diversity_loss_mlp": 0.0, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.06288570543658592, + "language_loss": 0.80066729, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81110263, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5156, + "time_per_iteration": 2.6498100757598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_mlp": 1.03691423, + "diversity_loss_mlp": 0.0, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.06594459780967553, + "language_loss": 0.84395134, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85440862, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.08825684, + "routerloss_mlp": 0.0, + "step": 5157, + "time_per_iteration": 2.6490535736083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_mlp": 1.03646517, + "diversity_loss_mlp": 0.0, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.06545947039571581, + "language_loss": 0.77654356, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78699744, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.08929443, + "routerloss_mlp": 0.0, + "step": 5158, + "time_per_iteration": 2.7639706134796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047617, + "balance_loss_mlp": 1.03868222, + "diversity_loss_mlp": 0.0, + "epoch": 0.9924971142747211, + "flos": 466557096960.0, + "grad_norm": 0.06168897901648668, + "language_loss": 0.8080498, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81852591, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.08935547, + "routerloss_mlp": 0.0, + "step": 5159, + "time_per_iteration": 2.7385036945343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049861, + "balance_loss_mlp": 1.04073584, + "diversity_loss_mlp": 0.0, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.06899221386615825, + "language_loss": 0.82835615, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83885473, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.09124756, + "routerloss_mlp": 0.0, + "step": 5160, + "time_per_iteration": 2.559859037399292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_mlp": 1.04187524, + "diversity_loss_mlp": 0.0, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.08668343737324606, + "language_loss": 0.81916565, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82967234, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.0880127, + "routerloss_mlp": 0.0, + "step": 5161, + "time_per_iteration": 2.5678670406341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048471, + "balance_loss_mlp": 1.03970289, + "diversity_loss_mlp": 0.0, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.06843444651252836, + "language_loss": 0.84165454, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85213923, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5162, + "time_per_iteration": 2.7581584453582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_mlp": 1.03851247, + "diversity_loss_mlp": 0.0, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06263196001846472, + "language_loss": 0.85711837, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86758995, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5163, + "time_per_iteration": 2.7846977710723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046625, + "balance_loss_mlp": 1.0378511, + "diversity_loss_mlp": 0.0, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.06164233206359557, + "language_loss": 0.83855546, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84902167, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.08782959, + "routerloss_mlp": 0.0, + "step": 5164, + "time_per_iteration": 2.716646671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_mlp": 1.03856587, + "diversity_loss_mlp": 0.0, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.06133860998625567, + "language_loss": 0.86944854, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.8799212, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.08703613, + "routerloss_mlp": 0.0, + "step": 5165, + "time_per_iteration": 2.614095687866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_mlp": 1.03866947, + "diversity_loss_mlp": 0.0, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.06754893239543082, + "language_loss": 0.80281818, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81329167, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.0869751, + "routerloss_mlp": 0.0, + "step": 5166, + "time_per_iteration": 3.028465986251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050745, + "balance_loss_mlp": 1.04209042, + "diversity_loss_mlp": 0.0, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.06956871932384841, + "language_loss": 0.82008004, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83058745, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.08666992, + "routerloss_mlp": 0.0, + "step": 5167, + "time_per_iteration": 2.698882818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_mlp": 1.03672278, + "diversity_loss_mlp": 0.0, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.06410012888366921, + "language_loss": 0.80157578, + "learning_rate": 8.735020633177104e-08, + "loss": 0.81203187, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.08898926, + "routerloss_mlp": 0.0, + "step": 5168, + "time_per_iteration": 2.7812376022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_mlp": 1.0377903, + "diversity_loss_mlp": 0.0, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.06620347908149736, + "language_loss": 0.82235384, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83281839, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.08673096, + "routerloss_mlp": 0.0, + "step": 5169, + "time_per_iteration": 2.7237818241119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_mlp": 1.0389545, + "diversity_loss_mlp": 0.0, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.06912708749251066, + "language_loss": 0.82253057, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83300692, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.08691406, + "routerloss_mlp": 0.0, + "step": 5170, + "time_per_iteration": 2.9818952083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047087, + "balance_loss_mlp": 1.03824186, + "diversity_loss_mlp": 0.0, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.08243647739411311, + "language_loss": 0.82281691, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83328784, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.08850098, + "routerloss_mlp": 0.0, + "step": 5171, + "time_per_iteration": 2.7422502040863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104891, + "balance_loss_mlp": 1.04017246, + "diversity_loss_mlp": 0.0, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.06824796371814347, + "language_loss": 0.86093032, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87141943, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.08746338, + "routerloss_mlp": 0.0, + "step": 5172, + "time_per_iteration": 2.51432728767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046756, + "balance_loss_mlp": 1.03810704, + "diversity_loss_mlp": 0.0, + "epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.06509423598404523, + "language_loss": 0.85527599, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86574364, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.08654785, + "routerloss_mlp": 0.0, + "step": 5173, + "time_per_iteration": 2.6191818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_mlp": 1.0386107, + "diversity_loss_mlp": 0.0, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.06870476422803651, + "language_loss": 0.81628877, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82676393, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.08911133, + "routerloss_mlp": 0.0, + "step": 5174, + "time_per_iteration": 2.569406270980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046917, + "balance_loss_mlp": 1.03805971, + "diversity_loss_mlp": 0.0, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.06879136838428648, + "language_loss": 0.81909287, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82956201, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.08862305, + "routerloss_mlp": 0.0, + "step": 5175, + "time_per_iteration": 2.5882654190063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049317, + "balance_loss_mlp": 1.04018593, + "diversity_loss_mlp": 0.0, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.08802784581931292, + "language_loss": 0.76484299, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77533621, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.09130859, + "routerloss_mlp": 0.0, + "step": 5176, + "time_per_iteration": 2.7355172634124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00781269, + "balance_loss_mlp": 1.31630397, + "diversity_loss_mlp": 0.2250234, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.03484461119289524, + "language_loss": 0.80370349, + "learning_rate": 4.280223671243588e-08, + "loss": 0.81151617, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01060532, + "step": 5177, + "time_per_iteration": 2.488933563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_mlp": 1.03673339, + "diversity_loss_mlp": 0.0, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.060646192988618466, + "language_loss": 0.80473614, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81519341, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5178, + "time_per_iteration": 2.860849380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_mlp": 1.03690243, + "diversity_loss_mlp": 0.0, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.06956117500096984, + "language_loss": 0.73755258, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74801028, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.08868408, + "routerloss_mlp": 0.0, + "step": 5179, + "time_per_iteration": 2.652787446975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_mlp": 1.04275656, + "diversity_loss_mlp": 0.0, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.081637230316847, + "language_loss": 0.89049286, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.90100861, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.08831787, + "routerloss_mlp": 0.0, + "step": 5180, + "time_per_iteration": 2.7644760608673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048065, + "balance_loss_mlp": 1.03896928, + "diversity_loss_mlp": 0.0, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.0759879935902396, + "language_loss": 0.81941384, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82989448, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.09100342, + "routerloss_mlp": 0.0, + "step": 5181, + "time_per_iteration": 2.9104771614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046151, + "balance_loss_mlp": 1.03727567, + "diversity_loss_mlp": 0.0, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.0884261396290618, + "language_loss": 0.76887906, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77934057, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.08886719, + "routerloss_mlp": 0.0, + "step": 5182, + "time_per_iteration": 2.721599578857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_mlp": 1.03908062, + "diversity_loss_mlp": 0.0, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.0648988132762576, + "language_loss": 0.81727201, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82775426, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.09155273, + "routerloss_mlp": 0.0, + "step": 5183, + "time_per_iteration": 2.7815635204315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048087, + "balance_loss_mlp": 1.03952742, + "diversity_loss_mlp": 0.0, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.05502432672300637, + "language_loss": 0.81058741, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82106829, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.08569336, + "routerloss_mlp": 0.0, + "step": 5184, + "time_per_iteration": 3.372502326965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105351, + "balance_loss_mlp": 1.04470634, + "diversity_loss_mlp": 0.0, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.08025246784684749, + "language_loss": 0.83187962, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84241462, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.08813477, + "routerloss_mlp": 0.0, + "step": 5185, + "time_per_iteration": 2.835519313812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_mlp": 1.03881598, + "diversity_loss_mlp": 0.0, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.06904687845719167, + "language_loss": 0.77359349, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78406811, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.08648682, + "routerloss_mlp": 0.0, + "step": 5186, + "time_per_iteration": 2.8937785625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_mlp": 1.03904831, + "diversity_loss_mlp": 0.0, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07280590001962838, + "language_loss": 0.79471743, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80519885, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.09094238, + "routerloss_mlp": 0.0, + "step": 5187, + "time_per_iteration": 2.635932207107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044774, + "balance_loss_mlp": 1.03606606, + "diversity_loss_mlp": 0.0, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.05359795749809877, + "language_loss": 0.84325933, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85370713, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.08721924, + "routerloss_mlp": 0.0, + "step": 5188, + "time_per_iteration": 2.7615973949432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_mlp": 1.03626382, + "diversity_loss_mlp": 0.0, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.0657633073490906, + "language_loss": 0.8937813, + "learning_rate": 7.861726879943021e-09, + "loss": 0.9042353, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.09136963, + "routerloss_mlp": 0.0, + "step": 5189, + "time_per_iteration": 2.543257236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_mlp": 1.03698051, + "diversity_loss_mlp": 0.0, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.0777283177143095, + "language_loss": 0.78666133, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79711688, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.08581543, + "routerloss_mlp": 0.0, + "step": 5190, + "time_per_iteration": 2.6314117908477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010468, + "balance_loss_mlp": 1.03816903, + "diversity_loss_mlp": 0.0, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.05898093011437241, + "language_loss": 0.84184742, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85231537, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.08636475, + "routerloss_mlp": 0.0, + "step": 5191, + "time_per_iteration": 2.6695079803466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104994, + "balance_loss_mlp": 1.04094553, + "diversity_loss_mlp": 0.0, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.06405577435904004, + "language_loss": 0.86847579, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87897515, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.08996582, + "routerloss_mlp": 0.0, + "step": 5192, + "time_per_iteration": 2.8024892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_mlp": 1.03778648, + "diversity_loss_mlp": 0.0, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.0686453524231272, + "language_loss": 0.88108921, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89155686, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.08978271, + "routerloss_mlp": 0.0, + "step": 5193, + "time_per_iteration": 2.4370131492614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_mlp": 1.0364393, + "diversity_loss_mlp": 0.0, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.06828670759326802, + "language_loss": 0.85050082, + "learning_rate": 1.552936970405927e-09, + "loss": 0.86095428, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.08905029, + "routerloss_mlp": 0.0, + "step": 5194, + "time_per_iteration": 2.765718698501587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_mlp": 1.04024625, + "diversity_loss_mlp": 0.0, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.07220046609149769, + "language_loss": 0.75592577, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76641411, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.08587646, + "routerloss_mlp": 0.0, + "step": 5195, + "time_per_iteration": 2.713330030441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.00790766, + "balance_loss_mlp": 1.33585405, + "diversity_loss_mlp": 0.22418211, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.03504416823087641, + "language_loss": 0.81017089, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81807852, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.0, + "routerloss_mlp": 0.01074793, + "step": 5196, + "time_per_iteration": 3.730872631072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_mlp": 1.028754, + "diversity_loss_mlp": 0.0, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.09543829836144671, + "language_loss": 0.69830346, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70866984, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.0788269, + "routerloss_mlp": 0.0, + "step": 5197, + "time_per_iteration": 4.026475429534912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018596, + "balance_loss_mlp": 1.01257348, + "diversity_loss_mlp": 0.0, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.032396730253084045, + "language_loss": 0.84149116, + "learning_rate": 0.0, + "loss": 0.85167712, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.06033325, + "routerloss_mlp": 0.0, + "step": 5198, + "time_per_iteration": 5.587369918823242 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.171926856433664e+16, + "train_loss": 0.8587041911183526, + "train_runtime": 15568.2077, + "train_samples_per_second": 42.734, + "train_steps_per_second": 0.334 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_competesmoev30/training_args.bin b/sft_pretrain/Full_competesmoev30/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b6a9277adbc97dc93da839d7637a55f6cb09192 --- /dev/null +++ b/sft_pretrain/Full_competesmoev30/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe769c1cc19035ec98b831c3889d46da4eb91c0444d770f41a815de3d19398a +size 7992